tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utf8collationiterator.cpp (17057B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2012-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * utf8collationiterator.cpp
      9 *
     10 * created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/utf8.h"
     19 #include "charstr.h"
     20 #include "cmemory.h"
     21 #include "collation.h"
     22 #include "collationdata.h"
     23 #include "collationfcd.h"
     24 #include "collationiterator.h"
     25 #include "normalizer2impl.h"
     26 #include "uassert.h"
     27 #include "utf8collationiterator.h"
     28 
     29 U_NAMESPACE_BEGIN
     30 
     31 UTF8CollationIterator::~UTF8CollationIterator() {}
     32 
     33 void
     34 UTF8CollationIterator::resetToOffset(int32_t newOffset) {
     35    reset();
     36    pos = newOffset;
     37 }
     38 
     39 int32_t
     40 UTF8CollationIterator::getOffset() const {
     41    return pos;
     42 }
     43 
     44 uint32_t
     45 UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
     46    if(pos == length) {
     47        c = U_SENTINEL;
     48        return Collation::FALLBACK_CE32;
     49    }
     50    // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
     51    c = u8[pos++];
     52    if(U8_IS_SINGLE(c)) {
     53        // ASCII 00..7F
     54        return trie->data32[c];
     55    }
     56    uint8_t t1, t2;
     57    if(0xe0 <= c && c < 0xf0 &&
     58            ((pos + 1) < length || length < 0) &&
     59            U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
     60            (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
     61        // U+0800..U+FFFF except surrogates
     62        c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
     63        pos += 2;
     64        return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
     65    } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
     66        // U+0080..U+07FF
     67        uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
     68        c = ((c & 0x1f) << 6) | t1;
     69        ++pos;
     70        return ce32;
     71    } else {
     72        // Function call for supplementary code points and error cases.
     73        // Illegal byte sequences yield U+FFFD.
     74        c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
     75        return data->getCE32(c);
     76    }
     77 }
     78 
     79 UBool
     80 UTF8CollationIterator::foundNULTerminator() {
     81    if(length < 0) {
     82        length = --pos;
     83        return true;
     84    } else {
     85        return false;
     86    }
     87 }
     88 
     89 UBool
     90 UTF8CollationIterator::forbidSurrogateCodePoints() const {
     91    return true;
     92 }
     93 
     94 UChar32
     95 UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
     96    if(pos == length) {
     97        return U_SENTINEL;
     98    }
     99    if(u8[pos] == 0 && length < 0) {
    100        length = pos;
    101        return U_SENTINEL;
    102    }
    103    UChar32 c;
    104    U8_NEXT_OR_FFFD(u8, pos, length, c);
    105    return c;
    106 }
    107 
    108 UChar32
    109 UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    110    if(pos == 0) {
    111        return U_SENTINEL;
    112    }
    113    UChar32 c;
    114    U8_PREV_OR_FFFD(u8, 0, pos, c);
    115    return c;
    116 }
    117 
    118 void
    119 UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    120    U8_FWD_N(u8, pos, length, num);
    121 }
    122 
    123 void
    124 UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    125    U8_BACK_N(u8, 0, pos, num);
    126 }
    127 
    128 // FCDUTF8CollationIterator ------------------------------------------------ ***
    129 
    130 FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {}
    131 
    132 void
    133 FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) {
    134    reset();
    135    start = pos = newOffset;
    136    state = CHECK_FWD;
    137 }
    138 
    139 int32_t
    140 FCDUTF8CollationIterator::getOffset() const {
    141    if(state != IN_NORMALIZED) {
    142        return pos;
    143    } else if(pos == 0) {
    144        return start;
    145    } else {
    146        return limit;
    147    }
    148 }
    149 
    150 uint32_t
    151 FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    152    for(;;) {
    153        if(state == CHECK_FWD) {
    154            // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.
    155            if(pos == length) {
    156                c = U_SENTINEL;
    157                return Collation::FALLBACK_CE32;
    158            }
    159            c = u8[pos++];
    160            if(U8_IS_SINGLE(c)) {
    161                // ASCII 00..7F
    162                return trie->data32[c];
    163            }
    164            uint8_t t1, t2;
    165            if(0xe0 <= c && c < 0xf0 &&
    166                    ((pos + 1) < length || length < 0) &&
    167                    U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
    168                    (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
    169                // U+0800..U+FFFF except surrogates
    170                c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
    171                pos += 2;
    172                if(CollationFCD::hasTccc(c) &&
    173                        (CollationFCD::maybeTibetanCompositeVowel(c) ||
    174                            (pos != length && nextHasLccc()))) {
    175                    pos -= 3;
    176                } else {
    177                    break;  // return CE32(BMP)
    178                }
    179            } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
    180                // U+0080..U+07FF
    181                uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
    182                c = ((c & 0x1f) << 6) | t1;
    183                ++pos;
    184                if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
    185                    pos -= 2;
    186                } else {
    187                    return ce32;
    188                }
    189            } else {
    190                // Function call for supplementary code points and error cases.
    191                // Illegal byte sequences yield U+FFFD.
    192                c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
    193                if(c == 0xfffd) {
    194                    return Collation::FFFD_CE32;
    195                } else {
    196                    U_ASSERT(c > 0xffff);
    197                    if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) {
    198                        pos -= 4;
    199                    } else {
    200                        return data->getCE32FromSupplementary(c);
    201                    }
    202                }
    203            }
    204            if(!nextSegment(errorCode)) {
    205                c = U_SENTINEL;
    206                return Collation::FALLBACK_CE32;
    207            }
    208            continue;
    209        } else if(state == IN_FCD_SEGMENT && pos != limit) {
    210            return UTF8CollationIterator::handleNextCE32(c, errorCode);
    211        } else if(state == IN_NORMALIZED && pos != normalized.length()) {
    212            c = normalized[pos++];
    213            break;
    214        } else {
    215            switchToForward();
    216        }
    217    }
    218    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    219 }
    220 
    221 UBool
    222 FCDUTF8CollationIterator::nextHasLccc() const {
    223    U_ASSERT(state == CHECK_FWD && pos != length);
    224    // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.
    225    // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)
    226    UChar32 c = u8[pos];
    227    if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return false; }
    228    int32_t i = pos;
    229    U8_NEXT_OR_FFFD(u8, i, length, c);
    230    if(c > 0xffff) { c = U16_LEAD(c); }
    231    return CollationFCD::hasLccc(c);
    232 }
    233 
    234 UBool
    235 FCDUTF8CollationIterator::previousHasTccc() const {
    236    U_ASSERT(state == CHECK_BWD && pos != 0);
    237    UChar32 c = u8[pos - 1];
    238    if(U8_IS_SINGLE(c)) { return false; }
    239    int32_t i = pos;
    240    U8_PREV_OR_FFFD(u8, 0, i, c);
    241    if(c > 0xffff) { c = U16_LEAD(c); }
    242    return CollationFCD::hasTccc(c);
    243 }
    244 
    245 char16_t
    246 FCDUTF8CollationIterator::handleGetTrailSurrogate() {
    247    if(state != IN_NORMALIZED) { return 0; }
    248    U_ASSERT(pos < normalized.length());
    249    char16_t trail;
    250    if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
    251    return trail;
    252 }
    253 
    254 UBool
    255 FCDUTF8CollationIterator::foundNULTerminator() {
    256    if(state == CHECK_FWD && length < 0) {
    257        length = --pos;
    258        return true;
    259    } else {
    260        return false;
    261    }
    262 }
    263 
    264 UChar32
    265 FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    266    UChar32 c;
    267    for(;;) {
    268        if(state == CHECK_FWD) {
    269            if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
    270                return U_SENTINEL;
    271            }
    272            if(U8_IS_SINGLE(c)) {
    273                ++pos;
    274                return c;
    275            }
    276            U8_NEXT_OR_FFFD(u8, pos, length, c);
    277            if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) &&
    278                    (CollationFCD::maybeTibetanCompositeVowel(c) ||
    279                        (pos != length && nextHasLccc()))) {
    280                // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
    281                // and we can use U8_LENGTH() rather than a previous-position variable.
    282                pos -= U8_LENGTH(c);
    283                if(!nextSegment(errorCode)) {
    284                    return U_SENTINEL;
    285                }
    286                continue;
    287            }
    288            return c;
    289        } else if(state == IN_FCD_SEGMENT && pos != limit) {
    290            U8_NEXT_OR_FFFD(u8, pos, length, c);
    291            return c;
    292        } else if(state == IN_NORMALIZED && pos != normalized.length()) {
    293            c = normalized.char32At(pos);
    294            pos += U16_LENGTH(c);
    295            return c;
    296        } else {
    297            switchToForward();
    298        }
    299    }
    300 }
    301 
    302 UChar32
    303 FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    304    UChar32 c;
    305    for(;;) {
    306        if(state == CHECK_BWD) {
    307            if(pos == 0) {
    308                return U_SENTINEL;
    309            }
    310            if(U8_IS_SINGLE(c = u8[pos - 1])) {
    311                --pos;
    312                return c;
    313            }
    314            U8_PREV_OR_FFFD(u8, 0, pos, c);
    315            if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) &&
    316                    (CollationFCD::maybeTibetanCompositeVowel(c) ||
    317                        (pos != 0 && previousHasTccc()))) {
    318                // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
    319                // and we can use U8_LENGTH() rather than a previous-position variable.
    320                pos += U8_LENGTH(c);
    321                if(!previousSegment(errorCode)) {
    322                    return U_SENTINEL;
    323                }
    324                continue;
    325            }
    326            return c;
    327        } else if(state == IN_FCD_SEGMENT && pos != start) {
    328            U8_PREV_OR_FFFD(u8, 0, pos, c);
    329            return c;
    330        } else if(state >= IN_NORMALIZED && pos != 0) {
    331            c = normalized.char32At(pos - 1);
    332            pos -= U16_LENGTH(c);
    333            return c;
    334        } else {
    335            switchToBackward();
    336        }
    337    }
    338 }
    339 
    340 void
    341 FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    342    // Specify the class to avoid a virtual-function indirection.
    343    // In Java, we would declare this class final.
    344    while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) {
    345        --num;
    346    }
    347 }
    348 
    349 void
    350 FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    351    // Specify the class to avoid a virtual-function indirection.
    352    // In Java, we would declare this class final.
    353    while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) {
    354        --num;
    355    }
    356 }
    357 
    358 void
    359 FCDUTF8CollationIterator::switchToForward() {
    360    U_ASSERT(state == CHECK_BWD ||
    361             (state == IN_FCD_SEGMENT && pos == limit) ||
    362             (state == IN_NORMALIZED && pos == normalized.length()));
    363    if(state == CHECK_BWD) {
    364        // Turn around from backward checking.
    365        start = pos;
    366        if(pos == limit) {
    367            state = CHECK_FWD;  // Check forward.
    368        } else {  // pos < limit
    369            state = IN_FCD_SEGMENT;  // Stay in FCD segment.
    370        }
    371    } else {
    372        // Reached the end of the FCD segment.
    373        if(state == IN_FCD_SEGMENT) {
    374            // The input text segment is FCD, extend it forward.
    375        } else {
    376            // The input text segment needed to be normalized.
    377            // Switch to checking forward from it.
    378            start = pos = limit;
    379        }
    380        state = CHECK_FWD;
    381    }
    382 }
    383 
    384 UBool
    385 FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) {
    386    if(U_FAILURE(errorCode)) { return false; }
    387    U_ASSERT(state == CHECK_FWD && pos != length);
    388    // The input text [start..pos[ passes the FCD check.
    389    int32_t segmentStart = pos;
    390    // Collect the characters being checked, in case they need to be normalized.
    391    UnicodeString s;
    392    uint8_t prevCC = 0;
    393    for(;;) {
    394        // Fetch the next character and its fcd16 value.
    395        int32_t cpStart = pos;
    396        UChar32 c;
    397        U8_NEXT_OR_FFFD(u8, pos, length, c);
    398        uint16_t fcd16 = nfcImpl.getFCD16(c);
    399        uint8_t leadCC = static_cast<uint8_t>(fcd16 >> 8);
    400        if(leadCC == 0 && cpStart != segmentStart) {
    401            // FCD boundary before this character.
    402            pos = cpStart;
    403            break;
    404        }
    405        s.append(c);
    406        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    407            // Fails FCD check. Find the next FCD boundary and normalize.
    408            while(pos != length) {
    409                cpStart = pos;
    410                U8_NEXT_OR_FFFD(u8, pos, length, c);
    411                if(nfcImpl.getFCD16(c) <= 0xff) {
    412                    pos = cpStart;
    413                    break;
    414                }
    415                s.append(c);
    416            }
    417            if(!normalize(s, errorCode)) { return false; }
    418            start = segmentStart;
    419            limit = pos;
    420            state = IN_NORMALIZED;
    421            pos = 0;
    422            return true;
    423        }
    424        prevCC = static_cast<uint8_t>(fcd16);
    425        if(pos == length || prevCC == 0) {
    426            // FCD boundary after the last character.
    427            break;
    428        }
    429    }
    430    limit = pos;
    431    pos = segmentStart;
    432    U_ASSERT(pos != limit);
    433    state = IN_FCD_SEGMENT;
    434    return true;
    435 }
    436 
    437 void
    438 FCDUTF8CollationIterator::switchToBackward() {
    439    U_ASSERT(state == CHECK_FWD ||
    440             (state == IN_FCD_SEGMENT && pos == start) ||
    441             (state >= IN_NORMALIZED && pos == 0));
    442    if(state == CHECK_FWD) {
    443        // Turn around from forward checking.
    444        limit = pos;
    445        if(pos == start) {
    446            state = CHECK_BWD;  // Check backward.
    447        } else {  // pos > start
    448            state = IN_FCD_SEGMENT;  // Stay in FCD segment.
    449        }
    450    } else {
    451        // Reached the start of the FCD segment.
    452        if(state == IN_FCD_SEGMENT) {
    453            // The input text segment is FCD, extend it backward.
    454        } else {
    455            // The input text segment needed to be normalized.
    456            // Switch to checking backward from it.
    457            limit = pos = start;
    458        }
    459        state = CHECK_BWD;
    460    }
    461 }
    462 
    463 UBool
    464 FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) {
    465    if(U_FAILURE(errorCode)) { return false; }
    466    U_ASSERT(state == CHECK_BWD && pos != 0);
    467    // The input text [pos..limit[ passes the FCD check.
    468    int32_t segmentLimit = pos;
    469    // Collect the characters being checked, in case they need to be normalized.
    470    UnicodeString s;
    471    uint8_t nextCC = 0;
    472    for(;;) {
    473        // Fetch the previous character and its fcd16 value.
    474        int32_t cpLimit = pos;
    475        UChar32 c;
    476        U8_PREV_OR_FFFD(u8, 0, pos, c);
    477        uint16_t fcd16 = nfcImpl.getFCD16(c);
    478        uint8_t trailCC = static_cast<uint8_t>(fcd16);
    479        if(trailCC == 0 && cpLimit != segmentLimit) {
    480            // FCD boundary after this character.
    481            pos = cpLimit;
    482            break;
    483        }
    484        s.append(c);
    485        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
    486                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    487            // Fails FCD check. Find the previous FCD boundary and normalize.
    488            while(fcd16 > 0xff && pos != 0) {
    489                cpLimit = pos;
    490                U8_PREV_OR_FFFD(u8, 0, pos, c);
    491                fcd16 = nfcImpl.getFCD16(c);
    492                if(fcd16 == 0) {
    493                    pos = cpLimit;
    494                    break;
    495                }
    496                s.append(c);
    497            }
    498            s.reverse();
    499            if(!normalize(s, errorCode)) { return false; }
    500            limit = segmentLimit;
    501            start = pos;
    502            state = IN_NORMALIZED;
    503            pos = normalized.length();
    504            return true;
    505        }
    506        nextCC = static_cast<uint8_t>(fcd16 >> 8);
    507        if(pos == 0 || nextCC == 0) {
    508            // FCD boundary before the following character.
    509            break;
    510        }
    511    }
    512    start = pos;
    513    pos = segmentLimit;
    514    U_ASSERT(pos != start);
    515    state = IN_FCD_SEGMENT;
    516    return true;
    517 }
    518 
    519 UBool
    520 FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
    521    // NFD without argument checking.
    522    U_ASSERT(U_SUCCESS(errorCode));
    523    nfcImpl.decompose(s, normalized, errorCode);
    524    return U_SUCCESS(errorCode);
    525 }
    526 
    527 U_NAMESPACE_END
    528 
    529 #endif  // !UCONFIG_NO_COLLATION