tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utf16collationiterator.cpp (15766B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2010-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * utf16collationiterator.cpp
      9 *
     10 * created on: 2010oct27
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "charstr.h"
     19 #include "cmemory.h"
     20 #include "collation.h"
     21 #include "collationdata.h"
     22 #include "collationfcd.h"
     23 #include "collationiterator.h"
     24 #include "normalizer2impl.h"
     25 #include "uassert.h"
     26 #include "utf16collationiterator.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
     31                                               const char16_t *newText)
     32        : CollationIterator(other),
     33          start(newText),
     34          pos(newText + (other.pos - other.start)),
     35          limit(other.limit == nullptr ? nullptr : newText + (other.limit - other.start)) {
     36 }
     37 
     38 UTF16CollationIterator::~UTF16CollationIterator() {}
     39 
     40 bool
     41 UTF16CollationIterator::operator==(const CollationIterator &other) const {
     42    if(!CollationIterator::operator==(other)) { return false; }
     43    const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
     44    // Compare the iterator state but not the text: Assume that the caller does that.
     45    return (pos - start) == (o.pos - o.start);
     46 }
     47 
     48 void
     49 UTF16CollationIterator::resetToOffset(int32_t newOffset) {
     50    reset();
     51    pos = start + newOffset;
     52 }
     53 
     54 int32_t
     55 UTF16CollationIterator::getOffset() const {
     56    return static_cast<int32_t>(pos - start);
     57 }
     58 
     59 uint32_t
     60 UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
     61    if(pos == limit) {
     62        c = U_SENTINEL;
     63        return Collation::FALLBACK_CE32;
     64    }
     65    c = *pos++;
     66    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
     67 }
     68 
     69 char16_t
     70 UTF16CollationIterator::handleGetTrailSurrogate() {
     71    if(pos == limit) { return 0; }
     72    char16_t trail;
     73    if(U16_IS_TRAIL(trail = *pos)) { ++pos; }
     74    return trail;
     75 }
     76 
     77 UBool
     78 UTF16CollationIterator::foundNULTerminator() {
     79    if(limit == nullptr) {
     80        limit = --pos;
     81        return true;
     82    } else {
     83        return false;
     84    }
     85 }
     86 
     87 UChar32
     88 UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
     89    if(pos == limit) {
     90        return U_SENTINEL;
     91    }
     92    UChar32 c = *pos;
     93    if(c == 0 && limit == nullptr) {
     94        limit = pos;
     95        return U_SENTINEL;
     96    }
     97    ++pos;
     98    char16_t trail;
     99    if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
    100        ++pos;
    101        return U16_GET_SUPPLEMENTARY(c, trail);
    102    } else {
    103        return c;
    104    }
    105 }
    106 
    107 UChar32
    108 UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    109    if(pos == start) {
    110        return U_SENTINEL;
    111    }
    112    UChar32 c = *--pos;
    113    char16_t lead;
    114    if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
    115        --pos;
    116        return U16_GET_SUPPLEMENTARY(lead, c);
    117    } else {
    118        return c;
    119    }
    120 }
    121 
    122 void
    123 UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    124    while(num > 0 && pos != limit) {
    125        UChar32 c = *pos;
    126        if(c == 0 && limit == nullptr) {
    127            limit = pos;
    128            break;
    129        }
    130        ++pos;
    131        --num;
    132        if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
    133            ++pos;
    134        }
    135    }
    136 }
    137 
    138 void
    139 UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    140    while(num > 0 && pos != start) {
    141        UChar32 c = *--pos;
    142        --num;
    143        if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
    144            --pos;
    145        }
    146    }
    147 }
    148 
    149 // FCDUTF16CollationIterator ----------------------------------------------- ***
    150 
    151 FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
    152                                                     const char16_t *newText)
    153        : UTF16CollationIterator(other),
    154          rawStart(newText),
    155          segmentStart(newText + (other.segmentStart - other.rawStart)),
    156          segmentLimit(other.segmentLimit == nullptr ? nullptr : newText + (other.segmentLimit - other.rawStart)),
    157          rawLimit(other.rawLimit == nullptr ? nullptr : newText + (other.rawLimit - other.rawStart)),
    158          nfcImpl(other.nfcImpl),
    159          normalized(other.normalized),
    160          checkDir(other.checkDir) {
    161    if(checkDir != 0 || other.start == other.segmentStart) {
    162        start = newText + (other.start - other.rawStart);
    163        pos = newText + (other.pos - other.rawStart);
    164        limit = other.limit == nullptr ? nullptr : newText + (other.limit - other.rawStart);
    165    } else {
    166        start = normalized.getBuffer();
    167        pos = start + (other.pos - other.start);
    168        limit = start + normalized.length();
    169    }
    170 }
    171 
    172 FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {}
    173 
    174 bool
    175 FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
    176    // Skip the UTF16CollationIterator and call its parent.
    177    if(!CollationIterator::operator==(other)) { return false; }
    178    const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
    179    // Compare the iterator state but not the text: Assume that the caller does that.
    180    if(checkDir != o.checkDir) { return false; }
    181    if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return false; }
    182    if(checkDir != 0 || start == segmentStart) {
    183        return (pos - rawStart) == (o.pos - o.rawStart);
    184    } else {
    185        return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
    186                (pos - start) == (o.pos - o.start);
    187    }
    188 }
    189 
    190 void
    191 FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
    192    reset();
    193    start = segmentStart = pos = rawStart + newOffset;
    194    limit = rawLimit;
    195    checkDir = 1;
    196 }
    197 
    198 int32_t
    199 FCDUTF16CollationIterator::getOffset() const {
    200    if(checkDir != 0 || start == segmentStart) {
    201        return static_cast<int32_t>(pos - rawStart);
    202    } else if(pos == start) {
    203        return static_cast<int32_t>(segmentStart - rawStart);
    204    } else {
    205        return static_cast<int32_t>(segmentLimit - rawStart);
    206    }
    207 }
    208 
    209 uint32_t
    210 FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    211    for(;;) {
    212        if(checkDir > 0) {
    213            if(pos == limit) {
    214                c = U_SENTINEL;
    215                return Collation::FALLBACK_CE32;
    216            }
    217            c = *pos++;
    218            if(CollationFCD::hasTccc(c)) {
    219                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    220                        (pos != limit && CollationFCD::hasLccc(*pos))) {
    221                    --pos;
    222                    if(!nextSegment(errorCode)) {
    223                        c = U_SENTINEL;
    224                        return Collation::FALLBACK_CE32;
    225                    }
    226                    c = *pos++;
    227                }
    228            }
    229            break;
    230        } else if(checkDir == 0 && pos != limit) {
    231            c = *pos++;
    232            break;
    233        } else {
    234            switchToForward();
    235        }
    236    }
    237    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    238 }
    239 
    240 UBool
    241 FCDUTF16CollationIterator::foundNULTerminator() {
    242    if(limit == nullptr) {
    243        limit = rawLimit = --pos;
    244        return true;
    245    } else {
    246        return false;
    247    }
    248 }
    249 
    250 UChar32
    251 FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    252    UChar32 c;
    253    for(;;) {
    254        if(checkDir > 0) {
    255            if(pos == limit) {
    256                return U_SENTINEL;
    257            }
    258            c = *pos++;
    259            if(CollationFCD::hasTccc(c)) {
    260                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    261                        (pos != limit && CollationFCD::hasLccc(*pos))) {
    262                    --pos;
    263                    if(!nextSegment(errorCode)) {
    264                        return U_SENTINEL;
    265                    }
    266                    c = *pos++;
    267                }
    268            } else if(c == 0 && limit == nullptr) {
    269                limit = rawLimit = --pos;
    270                return U_SENTINEL;
    271            }
    272            break;
    273        } else if(checkDir == 0 && pos != limit) {
    274            c = *pos++;
    275            break;
    276        } else {
    277            switchToForward();
    278        }
    279    }
    280    char16_t trail;
    281    if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
    282        ++pos;
    283        return U16_GET_SUPPLEMENTARY(c, trail);
    284    } else {
    285        return c;
    286    }
    287 }
    288 
    289 UChar32
    290 FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    291    UChar32 c;
    292    for(;;) {
    293        if(checkDir < 0) {
    294            if(pos == start) {
    295                return U_SENTINEL;
    296            }
    297            c = *--pos;
    298            if(CollationFCD::hasLccc(c)) {
    299                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
    300                        (pos != start && CollationFCD::hasTccc(*(pos - 1)))) {
    301                    ++pos;
    302                    if(!previousSegment(errorCode)) {
    303                        return U_SENTINEL;
    304                    }
    305                    c = *--pos;
    306                }
    307            }
    308            break;
    309        } else if(checkDir == 0 && pos != start) {
    310            c = *--pos;
    311            break;
    312        } else {
    313            switchToBackward();
    314        }
    315    }
    316    char16_t lead;
    317    if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
    318        --pos;
    319        return U16_GET_SUPPLEMENTARY(lead, c);
    320    } else {
    321        return c;
    322    }
    323 }
    324 
    325 void
    326 FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    327    // Specify the class to avoid a virtual-function indirection.
    328    // In Java, we would declare this class final.
    329    while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
    330        --num;
    331    }
    332 }
    333 
    334 void
    335 FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    336    // Specify the class to avoid a virtual-function indirection.
    337    // In Java, we would declare this class final.
    338    while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
    339        --num;
    340    }
    341 }
    342 
    343 void
    344 FCDUTF16CollationIterator::switchToForward() {
    345    U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
    346    if(checkDir < 0) {
    347        // Turn around from backward checking.
    348        start = segmentStart = pos;
    349        if(pos == segmentLimit) {
    350            limit = rawLimit;
    351            checkDir = 1;  // Check forward.
    352        } else {  // pos < segmentLimit
    353            checkDir = 0;  // Stay in FCD segment.
    354        }
    355    } else {
    356        // Reached the end of the FCD segment.
    357        if(start == segmentStart) {
    358            // The input text segment is FCD, extend it forward.
    359        } else {
    360            // The input text segment needed to be normalized.
    361            // Switch to checking forward from it.
    362            pos = start = segmentStart = segmentLimit;
    363            // Note: If this segment is at the end of the input text,
    364            // then it might help to return false to indicate that, so that
    365            // we do not have to re-check and normalize when we turn around and go backwards.
    366            // However, that would complicate the call sites for an optimization of an unusual case.
    367        }
    368        limit = rawLimit;
    369        checkDir = 1;
    370    }
    371 }
    372 
    373 UBool
    374 FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
    375    if(U_FAILURE(errorCode)) { return false; }
    376    U_ASSERT(checkDir > 0 && pos != limit);
    377    // The input text [segmentStart..pos[ passes the FCD check.
    378    const char16_t *p = pos;
    379    uint8_t prevCC = 0;
    380    for(;;) {
    381        // Fetch the next character's fcd16 value.
    382        const char16_t *q = p;
    383        uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit);
    384        uint8_t leadCC = static_cast<uint8_t>(fcd16 >> 8);
    385        if(leadCC == 0 && q != pos) {
    386            // FCD boundary before the [q, p[ character.
    387            limit = segmentLimit = q;
    388            break;
    389        }
    390        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    391            // Fails FCD check. Find the next FCD boundary and normalize.
    392            do {
    393                q = p;
    394            } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff);
    395            if(!normalize(pos, q, errorCode)) { return false; }
    396            pos = start;
    397            break;
    398        }
    399        prevCC = static_cast<uint8_t>(fcd16);
    400        if(p == rawLimit || prevCC == 0) {
    401            // FCD boundary after the last character.
    402            limit = segmentLimit = p;
    403            break;
    404        }
    405    }
    406    U_ASSERT(pos != limit);
    407    checkDir = 0;
    408    return true;
    409 }
    410 
    411 void
    412 FCDUTF16CollationIterator::switchToBackward() {
    413    U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
    414    if(checkDir > 0) {
    415        // Turn around from forward checking.
    416        limit = segmentLimit = pos;
    417        if(pos == segmentStart) {
    418            start = rawStart;
    419            checkDir = -1;  // Check backward.
    420        } else {  // pos > segmentStart
    421            checkDir = 0;  // Stay in FCD segment.
    422        }
    423    } else {
    424        // Reached the start of the FCD segment.
    425        if(start == segmentStart) {
    426            // The input text segment is FCD, extend it backward.
    427        } else {
    428            // The input text segment needed to be normalized.
    429            // Switch to checking backward from it.
    430            pos = limit = segmentLimit = segmentStart;
    431        }
    432        start = rawStart;
    433        checkDir = -1;
    434    }
    435 }
    436 
    437 UBool
    438 FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
    439    if(U_FAILURE(errorCode)) { return false; }
    440    U_ASSERT(checkDir < 0 && pos != start);
    441    // The input text [pos..segmentLimit[ passes the FCD check.
    442    const char16_t *p = pos;
    443    uint8_t nextCC = 0;
    444    for(;;) {
    445        // Fetch the previous character's fcd16 value.
    446        const char16_t *q = p;
    447        uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p);
    448        uint8_t trailCC = static_cast<uint8_t>(fcd16);
    449        if(trailCC == 0 && q != pos) {
    450            // FCD boundary after the [p, q[ character.
    451            start = segmentStart = q;
    452            break;
    453        }
    454        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
    455                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
    456            // Fails FCD check. Find the previous FCD boundary and normalize.
    457            do {
    458                q = p;
    459            } while(fcd16 > 0xff && p != rawStart &&
    460                    (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0);
    461            if(!normalize(q, pos, errorCode)) { return false; }
    462            pos = limit;
    463            break;
    464        }
    465        nextCC = static_cast<uint8_t>(fcd16 >> 8);
    466        if(p == rawStart || nextCC == 0) {
    467            // FCD boundary before the following character.
    468            start = segmentStart = p;
    469            break;
    470        }
    471    }
    472    U_ASSERT(pos != start);
    473    checkDir = 0;
    474    return true;
    475 }
    476 
    477 UBool
    478 FCDUTF16CollationIterator::normalize(const char16_t *from, const char16_t *to, UErrorCode &errorCode) {
    479    // NFD without argument checking.
    480    U_ASSERT(U_SUCCESS(errorCode));
    481    nfcImpl.decompose(from, to, normalized, static_cast<int32_t>(to - from), errorCode);
    482    if(U_FAILURE(errorCode)) { return false; }
    483    // Switch collation processing into the FCD buffer
    484    // with the result of normalizing [segmentStart, segmentLimit[.
    485    segmentStart = from;
    486    segmentLimit = to;
    487    start = normalized.getBuffer();
    488    limit = start + normalized.length();
    489    return true;
    490 }
    491 
    492 U_NAMESPACE_END
    493 
    494 #endif  // !UCONFIG_NO_COLLATION