tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

uiter.cpp (32514B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2002-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  uiter.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2002jan18
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 #include "unicode/ustring.h"
     21 #include "unicode/chariter.h"
     22 #include "unicode/rep.h"
     23 #include "unicode/uiter.h"
     24 #include "unicode/utf.h"
     25 #include "unicode/utf8.h"
     26 #include "unicode/utf16.h"
     27 #include "cstring.h"
     28 
     29 U_NAMESPACE_USE
     30 
     31 #define IS_EVEN(n) (((n)&1)==0)
     32 #define IS_POINTER_EVEN(p) IS_EVEN((size_t)p)
     33 
     34 U_CDECL_BEGIN
     35 
     36 /* No-Op UCharIterator implementation for illegal input --------------------- */
     37 
     38 static int32_t U_CALLCONV
     39 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
     40    return 0;
     41 }
     42 
     43 static int32_t U_CALLCONV
     44 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
     45    return 0;
     46 }
     47 
     48 static UBool U_CALLCONV
     49 noopHasNext(UCharIterator * /*iter*/) {
     50    return false;
     51 }
     52 
     53 static UChar32 U_CALLCONV
     54 noopCurrent(UCharIterator * /*iter*/) {
     55    return U_SENTINEL;
     56 }
     57 
     58 static uint32_t U_CALLCONV
     59 noopGetState(const UCharIterator * /*iter*/) {
     60    return UITER_NO_STATE;
     61 }
     62 
     63 static void U_CALLCONV
     64 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
     65    *pErrorCode=U_UNSUPPORTED_ERROR;
     66 }
     67 
     68 static const UCharIterator noopIterator={
     69    nullptr, 0, 0, 0, 0, 0,
     70    noopGetIndex,
     71    noopMove,
     72    noopHasNext,
     73    noopHasNext,
     74    noopCurrent,
     75    noopCurrent,
     76    noopCurrent,
     77    nullptr,
     78    noopGetState,
     79    noopSetState
     80 };
     81 
     82 /* UCharIterator implementation for simple strings -------------------------- */
     83 
     84 /*
     85 * This is an implementation of a code unit (char16_t) iterator
     86 * for char16_t * strings.
     87 *
     88 * The UCharIterator.context field holds a pointer to the string.
     89 */
     90 
     91 static int32_t U_CALLCONV
     92 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
     93    switch(origin) {
     94    case UITER_ZERO:
     95        return 0;
     96    case UITER_START:
     97        return iter->start;
     98    case UITER_CURRENT:
     99        return iter->index;
    100    case UITER_LIMIT:
    101        return iter->limit;
    102    case UITER_LENGTH:
    103        return iter->length;
    104    default:
    105        /* not a valid origin */
    106        /* Should never get here! */
    107        return -1;
    108    }
    109 }
    110 
    111 static int32_t U_CALLCONV
    112 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
    113    int32_t pos;
    114 
    115    switch(origin) {
    116    case UITER_ZERO:
    117        pos=delta;
    118        break;
    119    case UITER_START:
    120        pos=iter->start+delta;
    121        break;
    122    case UITER_CURRENT:
    123        pos=iter->index+delta;
    124        break;
    125    case UITER_LIMIT:
    126        pos=iter->limit+delta;
    127        break;
    128    case UITER_LENGTH:
    129        pos=iter->length+delta;
    130        break;
    131    default:
    132        return -1;  /* Error */
    133    }
    134 
    135    if(pos<iter->start) {
    136        pos=iter->start;
    137    } else if(pos>iter->limit) {
    138        pos=iter->limit;
    139    }
    140 
    141    return iter->index=pos;
    142 }
    143 
    144 static UBool U_CALLCONV
    145 stringIteratorHasNext(UCharIterator *iter) {
    146    return iter->index<iter->limit;
    147 }
    148 
    149 static UBool U_CALLCONV
    150 stringIteratorHasPrevious(UCharIterator *iter) {
    151    return iter->index>iter->start;
    152 }
    153 
    154 static UChar32 U_CALLCONV
    155 stringIteratorCurrent(UCharIterator *iter) {
    156    if(iter->index<iter->limit) {
    157        return ((const char16_t *)(iter->context))[iter->index];
    158    } else {
    159        return U_SENTINEL;
    160    }
    161 }
    162 
    163 static UChar32 U_CALLCONV
    164 stringIteratorNext(UCharIterator *iter) {
    165    if(iter->index<iter->limit) {
    166        return ((const char16_t *)(iter->context))[iter->index++];
    167    } else {
    168        return U_SENTINEL;
    169    }
    170 }
    171 
    172 static UChar32 U_CALLCONV
    173 stringIteratorPrevious(UCharIterator *iter) {
    174    if(iter->index>iter->start) {
    175        return ((const char16_t *)(iter->context))[--iter->index];
    176    } else {
    177        return U_SENTINEL;
    178    }
    179 }
    180 
    181 static uint32_t U_CALLCONV
    182 stringIteratorGetState(const UCharIterator *iter) {
    183    return (uint32_t)iter->index;
    184 }
    185 
    186 static void U_CALLCONV
    187 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    188    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
    189        /* do nothing */
    190    } else if(iter==nullptr) {
    191        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    192    } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
    193        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    194    } else {
    195        iter->index=(int32_t)state;
    196    }
    197 }
    198 
    199 static const UCharIterator stringIterator={
    200    nullptr, 0, 0, 0, 0, 0,
    201    stringIteratorGetIndex,
    202    stringIteratorMove,
    203    stringIteratorHasNext,
    204    stringIteratorHasPrevious,
    205    stringIteratorCurrent,
    206    stringIteratorNext,
    207    stringIteratorPrevious,
    208    nullptr,
    209    stringIteratorGetState,
    210    stringIteratorSetState
    211 };
    212 
    213 U_CAPI void U_EXPORT2
    214 uiter_setString(UCharIterator *iter, const char16_t *s, int32_t length) {
    215    if (iter != nullptr) {
    216        if (s != nullptr && length >= -1) {
    217            *iter=stringIterator;
    218            iter->context=s;
    219            if(length>=0) {
    220                iter->length=length;
    221            } else {
    222                iter->length=u_strlen(s);
    223            }
    224            iter->limit=iter->length;
    225        } else {
    226            *iter=noopIterator;
    227        }
    228    }
    229 }
    230 
    231 /* UCharIterator implementation for UTF-16BE strings ------------------------ */
    232 
    233 /*
    234 * This is an implementation of a code unit (char16_t) iterator
    235 * for UTF-16BE strings, i.e., strings in byte-vectors where
    236 * each char16_t is stored as a big-endian pair of bytes.
    237 *
    238 * The UCharIterator.context field holds a pointer to the string.
    239 * Everything works just like with a normal char16_t iterator (uiter_setString),
    240 * except that UChars are assembled from byte pairs.
    241 */
    242 
    243 /* internal helper function */
    244 static inline UChar32
    245 utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
    246    const uint8_t *p=(const uint8_t *)iter->context;
    247    return ((char16_t)p[2*index]<<8)|(char16_t)p[2*index+1];
    248 }
    249 
    250 static UChar32 U_CALLCONV
    251 utf16BEIteratorCurrent(UCharIterator *iter) {
    252    int32_t index;
    253 
    254    if((index=iter->index)<iter->limit) {
    255        return utf16BEIteratorGet(iter, index);
    256    } else {
    257        return U_SENTINEL;
    258    }
    259 }
    260 
    261 static UChar32 U_CALLCONV
    262 utf16BEIteratorNext(UCharIterator *iter) {
    263    int32_t index;
    264 
    265    if((index=iter->index)<iter->limit) {
    266        iter->index=index+1;
    267        return utf16BEIteratorGet(iter, index);
    268    } else {
    269        return U_SENTINEL;
    270    }
    271 }
    272 
    273 static UChar32 U_CALLCONV
    274 utf16BEIteratorPrevious(UCharIterator *iter) {
    275    int32_t index;
    276 
    277    if((index=iter->index)>iter->start) {
    278        iter->index=--index;
    279        return utf16BEIteratorGet(iter, index);
    280    } else {
    281        return U_SENTINEL;
    282    }
    283 }
    284 
    285 static const UCharIterator utf16BEIterator={
    286    nullptr, 0, 0, 0, 0, 0,
    287    stringIteratorGetIndex,
    288    stringIteratorMove,
    289    stringIteratorHasNext,
    290    stringIteratorHasPrevious,
    291    utf16BEIteratorCurrent,
    292    utf16BEIteratorNext,
    293    utf16BEIteratorPrevious,
    294    nullptr,
    295    stringIteratorGetState,
    296    stringIteratorSetState
    297 };
    298 
    299 /*
    300 * Count the number of UChars in a UTF-16BE string before a terminating char16_t NUL,
    301 * i.e., before a pair of 0 bytes where the first 0 byte is at an even
    302 * offset from s.
    303 */
    304 static int32_t
    305 utf16BE_strlen(const char *s) {
    306    if(IS_POINTER_EVEN(s)) {
    307        /*
    308         * even-aligned, call u_strlen(s)
    309         * we are probably on a little-endian machine, but searching for char16_t NUL
    310         * does not care about endianness
    311         */
    312        return u_strlen((const char16_t *)s);
    313    } else {
    314        /* odd-aligned, search for pair of 0 bytes */
    315        const char *p=s;
    316 
    317        while(!(*p==0 && p[1]==0)) {
    318            p+=2;
    319        }
    320        return (int32_t)((p-s)/2);
    321    }
    322 }
    323 
    324 U_CAPI void U_EXPORT2
    325 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
    326    if(iter!=nullptr) {
    327        /* allow only even-length strings (the input length counts bytes) */
    328        if(s!=nullptr && (length==-1 || (length>=0 && IS_EVEN(length)))) {
    329            /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
    330            length>>=1;
    331 
    332            if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
    333                /* big-endian machine and 2-aligned UTF-16BE string: use normal char16_t iterator */
    334                uiter_setString(iter, (const char16_t *)s, length);
    335                return;
    336            }
    337 
    338            *iter=utf16BEIterator;
    339            iter->context=s;
    340            if(length>=0) {
    341                iter->length=length;
    342            } else {
    343                iter->length=utf16BE_strlen(s);
    344            }
    345            iter->limit=iter->length;
    346        } else {
    347            *iter=noopIterator;
    348        }
    349    }
    350 }
    351 
    352 /* UCharIterator wrapper around CharacterIterator --------------------------- */
    353 
    354 /*
    355 * This is wrapper code around a C++ CharacterIterator to
    356 * look like a C UCharIterator.
    357 *
    358 * The UCharIterator.context field holds a pointer to the CharacterIterator.
    359 */
    360 
    361 static int32_t U_CALLCONV
    362 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
    363    switch(origin) {
    364    case UITER_ZERO:
    365        return 0;
    366    case UITER_START:
    367        return ((CharacterIterator *)(iter->context))->startIndex();
    368    case UITER_CURRENT:
    369        return ((CharacterIterator *)(iter->context))->getIndex();
    370    case UITER_LIMIT:
    371        return ((CharacterIterator *)(iter->context))->endIndex();
    372    case UITER_LENGTH:
    373        return ((CharacterIterator *)(iter->context))->getLength();
    374    default:
    375        /* not a valid origin */
    376        /* Should never get here! */
    377        return -1;
    378    }
    379 }
    380 
    381 static int32_t U_CALLCONV
    382 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
    383    switch(origin) {
    384    case UITER_ZERO:
    385        ((CharacterIterator *)(iter->context))->setIndex(delta);
    386        return ((CharacterIterator *)(iter->context))->getIndex();
    387    case UITER_START:
    388    case UITER_CURRENT:
    389    case UITER_LIMIT:
    390        return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
    391    case UITER_LENGTH:
    392        ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
    393        return ((CharacterIterator *)(iter->context))->getIndex();
    394    default:
    395        /* not a valid origin */
    396        /* Should never get here! */
    397        return -1;
    398    }
    399 }
    400 
    401 static UBool U_CALLCONV
    402 characterIteratorHasNext(UCharIterator *iter) {
    403    return ((CharacterIterator *)(iter->context))->hasNext();
    404 }
    405 
    406 static UBool U_CALLCONV
    407 characterIteratorHasPrevious(UCharIterator *iter) {
    408    return ((CharacterIterator *)(iter->context))->hasPrevious();
    409 }
    410 
    411 static UChar32 U_CALLCONV
    412 characterIteratorCurrent(UCharIterator *iter) {
    413    UChar32 c;
    414 
    415    c=((CharacterIterator *)(iter->context))->current();
    416    if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
    417        return c;
    418    } else {
    419        return U_SENTINEL;
    420    }
    421 }
    422 
    423 static UChar32 U_CALLCONV
    424 characterIteratorNext(UCharIterator *iter) {
    425    if(((CharacterIterator *)(iter->context))->hasNext()) {
    426        return ((CharacterIterator *)(iter->context))->nextPostInc();
    427    } else {
    428        return U_SENTINEL;
    429    }
    430 }
    431 
    432 static UChar32 U_CALLCONV
    433 characterIteratorPrevious(UCharIterator *iter) {
    434    if(((CharacterIterator *)(iter->context))->hasPrevious()) {
    435        return ((CharacterIterator *)(iter->context))->previous();
    436    } else {
    437        return U_SENTINEL;
    438    }
    439 }
    440 
    441 static uint32_t U_CALLCONV
    442 characterIteratorGetState(const UCharIterator *iter) {
    443    return ((CharacterIterator *)(iter->context))->getIndex();
    444 }
    445 
    446 static void U_CALLCONV
    447 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
    448    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
    449        /* do nothing */
    450    } else if(iter==nullptr || iter->context==nullptr) {
    451        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    452    } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
    453        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    454    } else {
    455        ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
    456    }
    457 }
    458 
    459 static const UCharIterator characterIteratorWrapper={
    460    nullptr, 0, 0, 0, 0, 0,
    461    characterIteratorGetIndex,
    462    characterIteratorMove,
    463    characterIteratorHasNext,
    464    characterIteratorHasPrevious,
    465    characterIteratorCurrent,
    466    characterIteratorNext,
    467    characterIteratorPrevious,
    468    nullptr,
    469    characterIteratorGetState,
    470    characterIteratorSetState
    471 };
    472 
    473 U_CAPI void U_EXPORT2
    474 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
    475    if (iter != nullptr) {
    476        if (charIter != nullptr) {
    477            *iter=characterIteratorWrapper;
    478            iter->context=charIter;
    479        } else {
    480            *iter=noopIterator;
    481        }
    482    }
    483 }
    484 
    485 /* UCharIterator wrapper around Replaceable --------------------------------- */
    486 
    487 /*
    488 * This is an implementation of a code unit (char16_t) iterator
    489 * based on a Replaceable object.
    490 *
    491 * The UCharIterator.context field holds a pointer to the Replaceable.
    492 * UCharIterator.length and UCharIterator.index hold Replaceable.length()
    493 * and the iteration index.
    494 */
    495 
    496 static UChar32 U_CALLCONV
    497 replaceableIteratorCurrent(UCharIterator *iter) {
    498    if(iter->index<iter->limit) {
    499        return ((Replaceable *)(iter->context))->charAt(iter->index);
    500    } else {
    501        return U_SENTINEL;
    502    }
    503 }
    504 
    505 static UChar32 U_CALLCONV
    506 replaceableIteratorNext(UCharIterator *iter) {
    507    if(iter->index<iter->limit) {
    508        return ((Replaceable *)(iter->context))->charAt(iter->index++);
    509    } else {
    510        return U_SENTINEL;
    511    }
    512 }
    513 
    514 static UChar32 U_CALLCONV
    515 replaceableIteratorPrevious(UCharIterator *iter) {
    516    if(iter->index>iter->start) {
    517        return ((Replaceable *)(iter->context))->charAt(--iter->index);
    518    } else {
    519        return U_SENTINEL;
    520    }
    521 }
    522 
    523 static const UCharIterator replaceableIterator={
    524    nullptr, 0, 0, 0, 0, 0,
    525    stringIteratorGetIndex,
    526    stringIteratorMove,
    527    stringIteratorHasNext,
    528    stringIteratorHasPrevious,
    529    replaceableIteratorCurrent,
    530    replaceableIteratorNext,
    531    replaceableIteratorPrevious,
    532    nullptr,
    533    stringIteratorGetState,
    534    stringIteratorSetState
    535 };
    536 
    537 U_CAPI void U_EXPORT2
    538 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
    539    if (iter != nullptr) {
    540        if (rep != nullptr) {
    541            *iter=replaceableIterator;
    542            iter->context=rep;
    543            iter->limit=iter->length=rep->length();
    544        } else {
    545            *iter=noopIterator;
    546        }
    547    }
    548 }
    549 
    550 /* UCharIterator implementation for UTF-8 strings --------------------------- */
    551 
    552 /*
    553 * Possible, probably necessary only for an implementation for arbitrary
    554 * converters:
    555 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
    556 * This would require turning reservedFn into a close function and
    557 * introducing a uiter_close(iter).
    558 */
    559 
    560 #define UITER_CNV_CAPACITY 16
    561 
    562 /*
    563 * Minimal implementation:
    564 * Maintain a single-char16_t buffer for an additional surrogate.
    565 * The caller must not modify start and limit because they are used internally.
    566 *
    567 * Use UCharIterator fields as follows:
    568 *   context        pointer to UTF-8 string
    569 *   length         UTF-16 length of the string; -1 until lazy evaluation
    570 *   start          current UTF-8 index
    571 *   index          current UTF-16 index; may be -1="unknown" after setState()
    572 *   limit          UTF-8 length of the string
    573 *   reservedField  supplementary code point
    574 *
    575 * Since UCharIterator delivers 16-bit code units, the iteration can be
    576 * currently in the middle of the byte sequence for a supplementary code point.
    577 * In this case, reservedField will contain that code point and start will
    578 * point to after the corresponding byte sequence. The UTF-16 index will be
    579 * one less than what it would otherwise be corresponding to the UTF-8 index.
    580 * Otherwise, reservedField will be 0.
    581 */
    582 
    583 /*
    584 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
    585 * Add implementations that do not call strlen() for iteration but check for NUL.
    586 */
    587 
    588 static int32_t U_CALLCONV
    589 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
    590    switch(origin) {
    591    case UITER_ZERO:
    592    case UITER_START:
    593        return 0;
    594    case UITER_CURRENT:
    595        if(iter->index<0) {
    596            /* the current UTF-16 index is unknown after setState(), count from the beginning */
    597            const uint8_t *s;
    598            UChar32 c;
    599            int32_t i, limit, index;
    600 
    601            s=(const uint8_t *)iter->context;
    602            i=index=0;
    603            limit=iter->start; /* count up to the UTF-8 index */
    604            while(i<limit) {
    605                U8_NEXT_OR_FFFD(s, i, limit, c);
    606                index+=U16_LENGTH(c);
    607            }
    608 
    609            iter->start=i; /* just in case setState() did not get us to a code point boundary */
    610            if(i==iter->limit) {
    611                iter->length=index; /* in case it was <0 or wrong */
    612            }
    613            if(iter->reservedField!=0) {
    614                --index; /* we are in the middle of a supplementary code point */
    615            }
    616            iter->index=index;
    617        }
    618        return iter->index;
    619    case UITER_LIMIT:
    620    case UITER_LENGTH:
    621        if(iter->length<0) {
    622            const uint8_t *s;
    623            UChar32 c;
    624            int32_t i, limit, length;
    625 
    626            s=(const uint8_t *)iter->context;
    627            if(iter->index<0) {
    628                /*
    629                 * the current UTF-16 index is unknown after setState(),
    630                 * we must first count from the beginning to here
    631                 */
    632                i=length=0;
    633                limit=iter->start;
    634 
    635                /* count from the beginning to the current index */
    636                while(i<limit) {
    637                    U8_NEXT_OR_FFFD(s, i, limit, c);
    638                    length+=U16_LENGTH(c);
    639                }
    640 
    641                /* assume i==limit==iter->start, set the UTF-16 index */
    642                iter->start=i; /* just in case setState() did not get us to a code point boundary */
    643                iter->index= iter->reservedField!=0 ? length-1 : length;
    644            } else {
    645                i=iter->start;
    646                length=iter->index;
    647                if(iter->reservedField!=0) {
    648                    ++length;
    649                }
    650            }
    651 
    652            /* count from the current index to the end */
    653            limit=iter->limit;
    654            while(i<limit) {
    655                U8_NEXT_OR_FFFD(s, i, limit, c);
    656                length+=U16_LENGTH(c);
    657            }
    658            iter->length=length;
    659        }
    660        return iter->length;
    661    default:
    662        /* not a valid origin */
    663        /* Should never get here! */
    664        return -1;
    665    }
    666 }
    667 
    668 static int32_t U_CALLCONV
    669 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
    670    const uint8_t *s;
    671    UChar32 c;
    672    int32_t pos; /* requested UTF-16 index */
    673    int32_t i; /* UTF-8 index */
    674    UBool havePos;
    675 
    676    /* calculate the requested UTF-16 index */
    677    switch(origin) {
    678    case UITER_ZERO:
    679    case UITER_START:
    680        pos=delta;
    681        havePos=true;
    682        /* iter->index<0 (unknown) is possible */
    683        break;
    684    case UITER_CURRENT:
    685        if(iter->index>=0) {
    686            pos=iter->index+delta;
    687            havePos=true;
    688        } else {
    689            /* the current UTF-16 index is unknown after setState(), use only delta */
    690            pos=0;
    691            havePos=false;
    692        }
    693        break;
    694    case UITER_LIMIT:
    695    case UITER_LENGTH:
    696        if(iter->length>=0) {
    697            pos=iter->length+delta;
    698            havePos=true;
    699        } else {
    700            /* pin to the end, avoid counting the length */
    701            iter->index=-1;
    702            iter->start=iter->limit;
    703            iter->reservedField=0;
    704            if(delta>=0) {
    705                return UITER_UNKNOWN_INDEX;
    706            } else {
    707                /* the current UTF-16 index is unknown, use only delta */
    708                pos=0;
    709                havePos=false;
    710            }
    711        }
    712        break;
    713    default:
    714        return -1;  /* Error */
    715    }
    716 
    717    if(havePos) {
    718        /* shortcuts: pinning to the edges of the string */
    719        if(pos<=0) {
    720            iter->index=iter->start=iter->reservedField=0;
    721            return 0;
    722        } else if(iter->length>=0 && pos>=iter->length) {
    723            iter->index=iter->length;
    724            iter->start=iter->limit;
    725            iter->reservedField=0;
    726            return iter->index;
    727        }
    728 
    729        /* minimize the number of U8_NEXT/PREV operations */
    730        if(iter->index<0 || pos<iter->index/2) {
    731            /* go forward from the start instead of backward from the current index */
    732            iter->index=iter->start=iter->reservedField=0;
    733        } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
    734            /*
    735             * if we have the UTF-16 index and length and the new position is
    736             * closer to the end than the current index,
    737             * then go backward from the end instead of forward from the current index
    738             */
    739            iter->index=iter->length;
    740            iter->start=iter->limit;
    741            iter->reservedField=0;
    742        }
    743 
    744        delta=pos-iter->index;
    745        if(delta==0) {
    746            return iter->index; /* nothing to do */
    747        }
    748    } else {
    749        /* move relative to unknown UTF-16 index */
    750        if(delta==0) {
    751            return UITER_UNKNOWN_INDEX; /* nothing to do */
    752        } else if(-delta>=iter->start) {
    753            /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
    754            iter->index=iter->start=iter->reservedField=0;
    755            return 0;
    756        } else if(delta>=(iter->limit-iter->start)) {
    757            /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
    758            iter->index=iter->length; /* may or may not be <0 (unknown) */
    759            iter->start=iter->limit;
    760            iter->reservedField=0;
    761            return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
    762        }
    763    }
    764 
    765    /* delta!=0 */
    766 
    767    /* move towards the requested position, pin to the edges of the string */
    768    s=(const uint8_t *)iter->context;
    769    pos=iter->index; /* could be <0 (unknown) */
    770    i=iter->start;
    771    if(delta>0) {
    772        /* go forward */
    773        int32_t limit=iter->limit;
    774        if(iter->reservedField!=0) {
    775            iter->reservedField=0;
    776            ++pos;
    777            --delta;
    778        }
    779        while(delta>0 && i<limit) {
    780            U8_NEXT_OR_FFFD(s, i, limit, c);
    781            if(c<=0xffff) {
    782                ++pos;
    783                --delta;
    784            } else if(delta>=2) {
    785                pos+=2;
    786                delta-=2;
    787            } else /* delta==1 */ {
    788                /* stop in the middle of a supplementary code point */
    789                iter->reservedField=c;
    790                ++pos;
    791                break; /* delta=0; */
    792            }
    793        }
    794        if(i==limit) {
    795            if(iter->length<0 && iter->index>=0) {
    796                iter->length= iter->reservedField==0 ? pos : pos+1;
    797            } else if(iter->index<0 && iter->length>=0) {
    798                iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
    799            }
    800        }
    801    } else /* delta<0 */ {
    802        /* go backward */
    803        if(iter->reservedField!=0) {
    804            iter->reservedField=0;
    805            i-=4; /* we stayed behind the supplementary code point; go before it now */
    806            --pos;
    807            ++delta;
    808        }
    809        while(delta<0 && i>0) {
    810            U8_PREV_OR_FFFD(s, 0, i, c);
    811            if(c<=0xffff) {
    812                --pos;
    813                ++delta;
    814            } else if(delta<=-2) {
    815                pos-=2;
    816                delta+=2;
    817            } else /* delta==-1 */ {
    818                /* stop in the middle of a supplementary code point */
    819                i+=4; /* back to behind this supplementary code point for consistent state */
    820                iter->reservedField=c;
    821                --pos;
    822                break; /* delta=0; */
    823            }
    824        }
    825    }
    826 
    827    iter->start=i;
    828    if(iter->index>=0) {
    829        return iter->index=pos;
    830    } else {
    831        /* we started with index<0 (unknown) so pos is bogus */
    832        if(i<=1) {
    833            return iter->index=i; /* reached the beginning */
    834        } else {
    835            /* we still don't know the UTF-16 index */
    836            return UITER_UNKNOWN_INDEX;
    837        }
    838    }
    839 }
    840 
    841 static UBool U_CALLCONV
    842 utf8IteratorHasNext(UCharIterator *iter) {
    843    return iter->start<iter->limit || iter->reservedField!=0;
    844 }
    845 
    846 static UBool U_CALLCONV
    847 utf8IteratorHasPrevious(UCharIterator *iter) {
    848    return iter->start>0;
    849 }
    850 
    851 static UChar32 U_CALLCONV
    852 utf8IteratorCurrent(UCharIterator *iter) {
    853    if(iter->reservedField!=0) {
    854        return U16_TRAIL(iter->reservedField);
    855    } else if(iter->start<iter->limit) {
    856        const uint8_t *s=(const uint8_t *)iter->context;
    857        UChar32 c;
    858        int32_t i=iter->start;
    859 
    860        U8_NEXT_OR_FFFD(s, i, iter->limit, c);
    861        if(c<=0xffff) {
    862            return c;
    863        } else {
    864            return U16_LEAD(c);
    865        }
    866    } else {
    867        return U_SENTINEL;
    868    }
    869 }
    870 
    871 static UChar32 U_CALLCONV
    872 utf8IteratorNext(UCharIterator *iter) {
    873    int32_t index;
    874 
    875    if(iter->reservedField!=0) {
    876        char16_t trail=U16_TRAIL(iter->reservedField);
    877        iter->reservedField=0;
    878        if((index=iter->index)>=0) {
    879            iter->index=index+1;
    880        }
    881        return trail;
    882    } else if(iter->start<iter->limit) {
    883        const uint8_t *s=(const uint8_t *)iter->context;
    884        UChar32 c;
    885 
    886        U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
    887        if((index=iter->index)>=0) {
    888            iter->index=++index;
    889            if(iter->length<0 && iter->start==iter->limit) {
    890                iter->length= c<=0xffff ? index : index+1;
    891            }
    892        } else if(iter->start==iter->limit && iter->length>=0) {
    893            iter->index= c<=0xffff ? iter->length : iter->length-1;
    894        }
    895        if(c<=0xffff) {
    896            return c;
    897        } else {
    898            iter->reservedField=c;
    899            return U16_LEAD(c);
    900        }
    901    } else {
    902        return U_SENTINEL;
    903    }
    904 }
    905 
    906 static UChar32 U_CALLCONV
    907 utf8IteratorPrevious(UCharIterator *iter) {
    908    int32_t index;
    909 
    910    if(iter->reservedField!=0) {
    911        char16_t lead=U16_LEAD(iter->reservedField);
    912        iter->reservedField=0;
    913        iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
    914        if((index=iter->index)>0) {
    915            iter->index=index-1;
    916        }
    917        return lead;
    918    } else if(iter->start>0) {
    919        const uint8_t *s=(const uint8_t *)iter->context;
    920        UChar32 c;
    921 
    922        U8_PREV_OR_FFFD(s, 0, iter->start, c);
    923        if((index=iter->index)>0) {
    924            iter->index=index-1;
    925        } else if(iter->start<=1) {
    926            iter->index= c<=0xffff ? iter->start : iter->start+1;
    927        }
    928        if(c<=0xffff) {
    929            return c;
    930        } else {
    931            iter->start+=4; /* back to behind this supplementary code point for consistent state */
    932            iter->reservedField=c;
    933            return U16_TRAIL(c);
    934        }
    935    } else {
    936        return U_SENTINEL;
    937    }
    938 }
    939 
    940 static uint32_t U_CALLCONV
    941 utf8IteratorGetState(const UCharIterator *iter) {
    942    uint32_t state=(uint32_t)(iter->start<<1);
    943    if(iter->reservedField!=0) {
    944        state|=1;
    945    }
    946    return state;
    947 }
    948 
    949 static void U_CALLCONV
    950 utf8IteratorSetState(UCharIterator *iter,
    951                     uint32_t state,
    952                     UErrorCode *pErrorCode)
    953 {
    954    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
    955        /* do nothing */
    956    } else if(iter==nullptr) {
    957        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    958    } else if(state==utf8IteratorGetState(iter)) {
    959        /* setting to the current state: no-op */
    960    } else {
    961        int32_t index=(int32_t)(state>>1); /* UTF-8 index */
    962        state&=1; /* 1 if in surrogate pair, must be index>=4 */
    963 
    964        if((state==0 ? index<0 : index<4) || iter->limit<index) {
    965            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    966        } else {
    967            iter->start=index; /* restore UTF-8 byte index */
    968            if(index<=1) {
    969                iter->index=index;
    970            } else {
    971                iter->index=-1; /* unknown UTF-16 index */
    972            }
    973            if(state==0) {
    974                iter->reservedField=0;
    975            } else {
    976                /* verified index>=4 above */
    977                UChar32 c;
    978                U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
    979                if(c<=0xffff) {
    980                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    981                } else {
    982                    iter->reservedField=c;
    983                }
    984            }
    985        }
    986    }
    987 }
    988 
    989 static const UCharIterator utf8Iterator={
    990    nullptr, 0, 0, 0, 0, 0,
    991    utf8IteratorGetIndex,
    992    utf8IteratorMove,
    993    utf8IteratorHasNext,
    994    utf8IteratorHasPrevious,
    995    utf8IteratorCurrent,
    996    utf8IteratorNext,
    997    utf8IteratorPrevious,
    998    nullptr,
    999    utf8IteratorGetState,
   1000    utf8IteratorSetState
   1001 };
   1002 
   1003 U_CAPI void U_EXPORT2
   1004 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
   1005    if (iter != nullptr) {
   1006        if (s != nullptr && length >= -1) {
   1007            *iter=utf8Iterator;
   1008            iter->context=s;
   1009            if(length>=0) {
   1010                iter->limit=length;
   1011            } else {
   1012                iter->limit=(int32_t)uprv_strlen(s);
   1013            }
   1014            iter->length= iter->limit<=1 ? iter->limit : -1;
   1015        } else {
   1016            *iter=noopIterator;
   1017        }
   1018    }
   1019 }
   1020 
   1021 /* Helper functions --------------------------------------------------------- */
   1022 
   1023 U_CAPI UChar32 U_EXPORT2
   1024 uiter_current32(UCharIterator *iter) {
   1025    UChar32 c, c2;
   1026 
   1027    c=iter->current(iter);
   1028    if(U16_IS_SURROGATE(c)) {
   1029        if(U16_IS_SURROGATE_LEAD(c)) {
   1030            /*
   1031             * go to the next code unit
   1032             * we know that we are not at the limit because c!=U_SENTINEL
   1033             */
   1034            iter->move(iter, 1, UITER_CURRENT);
   1035            if(U16_IS_TRAIL(c2=iter->current(iter))) {
   1036                c=U16_GET_SUPPLEMENTARY(c, c2);
   1037            }
   1038 
   1039            /* undo index movement */
   1040            iter->move(iter, -1, UITER_CURRENT);
   1041        } else {
   1042            if(U16_IS_LEAD(c2=iter->previous(iter))) {
   1043                c=U16_GET_SUPPLEMENTARY(c2, c);
   1044            }
   1045            if(c2>=0) {
   1046                /* undo index movement */
   1047                iter->move(iter, 1, UITER_CURRENT);
   1048            }
   1049        }
   1050    }
   1051    return c;
   1052 }
   1053 
   1054 U_CAPI UChar32 U_EXPORT2
   1055 uiter_next32(UCharIterator *iter) {
   1056    UChar32 c, c2;
   1057 
   1058    c=iter->next(iter);
   1059    if(U16_IS_LEAD(c)) {
   1060        if(U16_IS_TRAIL(c2=iter->next(iter))) {
   1061            c=U16_GET_SUPPLEMENTARY(c, c2);
   1062        } else if(c2>=0) {
   1063            /* unmatched first surrogate, undo index movement */
   1064            iter->move(iter, -1, UITER_CURRENT);
   1065        }
   1066    }
   1067    return c;
   1068 }
   1069 
   1070 U_CAPI UChar32 U_EXPORT2
   1071 uiter_previous32(UCharIterator *iter) {
   1072    UChar32 c, c2;
   1073 
   1074    c=iter->previous(iter);
   1075    if(U16_IS_TRAIL(c)) {
   1076        if(U16_IS_LEAD(c2=iter->previous(iter))) {
   1077            c=U16_GET_SUPPLEMENTARY(c2, c);
   1078        } else if(c2>=0) {
   1079            /* unmatched second surrogate, undo index movement */
   1080            iter->move(iter, 1, UITER_CURRENT);
   1081        }
   1082    }
   1083    return c;
   1084 }
   1085 
   1086 U_CAPI uint32_t U_EXPORT2
   1087 uiter_getState(const UCharIterator *iter) {
   1088    if(iter==nullptr || iter->getState==nullptr) {
   1089        return UITER_NO_STATE;
   1090    } else {
   1091        return iter->getState(iter);
   1092    }
   1093 }
   1094 
   1095 U_CAPI void U_EXPORT2
   1096 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
   1097    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
   1098        /* do nothing */
   1099    } else if(iter==nullptr) {
   1100        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1101    } else if(iter->setState==nullptr) {
   1102        *pErrorCode=U_UNSUPPORTED_ERROR;
   1103    } else {
   1104        iter->setState(iter, state, pErrorCode);
   1105    }
   1106 }
   1107 
   1108 U_CDECL_END