tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ustrtrns.cpp (50418B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *
     11 * File ustrtrns.cpp
     12 *
     13 * Modification History:
     14 *
     15 *   Date        Name        Description
     16 *   9/10/2001    Ram    Creation.
     17 ******************************************************************************
     18 */
     19 
     20 /*******************************************************************************
     21 *
     22 * u_strTo* and u_strFrom* APIs
     23 * WCS functions moved to ustr_wcs.c for better modularization
     24 *
     25 *******************************************************************************
     26 */
     27 
     28 
     29 #include "unicode/putil.h"
     30 #include "unicode/ustring.h"
     31 #include "unicode/utf.h"
     32 #include "unicode/utf8.h"
     33 #include "unicode/utf16.h"
     34 #include "cstring.h"
     35 #include "cmemory.h"
     36 #include "ustr_imp.h"
     37 #include "uassert.h"
     38 
     39 U_CAPI char16_t* U_EXPORT2
     40 u_strFromUTF32WithSub(char16_t *dest,
     41               int32_t destCapacity,
     42               int32_t *pDestLength,
     43               const UChar32 *src,
     44               int32_t srcLength,
     45               UChar32 subchar, int32_t *pNumSubstitutions,
     46               UErrorCode *pErrorCode) {
     47    const UChar32 *srcLimit;
     48    UChar32 ch;
     49    char16_t *destLimit;
     50    char16_t *pDest;
     51    int32_t reqLength;
     52    int32_t numSubstitutions;
     53 
     54    /* args check */
     55    if(U_FAILURE(*pErrorCode)){
     56        return nullptr;
     57    }
     58    if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
     59        (destCapacity<0) || (dest == nullptr && destCapacity > 0) ||
     60        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
     61    ) {
     62        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
     63        return nullptr;
     64    }
     65 
     66    if(pNumSubstitutions != nullptr) {
     67        *pNumSubstitutions = 0;
     68    }
     69 
     70    pDest = dest;
     71    destLimit = (dest!=nullptr)?(dest + destCapacity):nullptr;
     72    reqLength = 0;
     73    numSubstitutions = 0;
     74 
     75    if(srcLength < 0) {
     76        /* simple loop for conversion of a NUL-terminated BMP string */
     77        while((ch=*src) != 0 &&
     78              ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
     79            ++src;
     80            if(pDest < destLimit) {
     81                *pDest++ = (char16_t)ch;
     82            } else {
     83                ++reqLength;
     84            }
     85        }
     86        srcLimit = src;
     87        if(ch != 0) {
     88            /* "complicated" case, find the end of the remaining string */
     89            while(*++srcLimit != 0) {}
     90        }
     91    } else {
     92      srcLimit = (src!=nullptr)?(src + srcLength):nullptr;
     93    }
     94 
     95    /* convert with length */
     96    while(src < srcLimit) {
     97        ch = *src++;
     98        do {
     99            /* usually "loops" once; twice only for writing subchar */
    100            if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
    101                if(pDest < destLimit) {
    102                    *pDest++ = (char16_t)ch;
    103                } else {
    104                    ++reqLength;
    105                }
    106                break;
    107            } else if(0x10000 <= ch && ch <= 0x10ffff) {
    108                if(pDest!=nullptr && ((pDest + 2) <= destLimit)) {
    109                    *pDest++ = U16_LEAD(ch);
    110                    *pDest++ = U16_TRAIL(ch);
    111                } else {
    112                    reqLength += 2;
    113                }
    114                break;
    115            } else if((ch = subchar) < 0) {
    116                /* surrogate code point, or not a Unicode code point at all */
    117                *pErrorCode = U_INVALID_CHAR_FOUND;
    118                return nullptr;
    119            } else {
    120                ++numSubstitutions;
    121            }
    122        } while(true);
    123    }
    124 
    125    reqLength += (int32_t)(pDest - dest);
    126    if(pDestLength) {
    127        *pDestLength = reqLength;
    128    }
    129    if(pNumSubstitutions != nullptr) {
    130        *pNumSubstitutions = numSubstitutions;
    131    }
    132 
    133    /* Terminate the buffer */
    134    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
    135    
    136    return dest;
    137 }
    138 
    139 U_CAPI char16_t* U_EXPORT2
    140 u_strFromUTF32(char16_t *dest,
    141               int32_t destCapacity, 
    142               int32_t *pDestLength,
    143               const UChar32 *src,
    144               int32_t srcLength,
    145               UErrorCode *pErrorCode) {
    146    return u_strFromUTF32WithSub(
    147            dest, destCapacity, pDestLength,
    148            src, srcLength,
    149            U_SENTINEL, nullptr,
    150            pErrorCode);
    151 }
    152 
    153 U_CAPI UChar32* U_EXPORT2 
    154 u_strToUTF32WithSub(UChar32 *dest,
    155             int32_t destCapacity,
    156             int32_t *pDestLength,
    157             const char16_t *src,
    158             int32_t srcLength,
    159             UChar32 subchar, int32_t *pNumSubstitutions,
    160             UErrorCode *pErrorCode) {
    161    const char16_t *srcLimit;
    162    UChar32 ch;
    163    char16_t ch2;
    164    UChar32 *destLimit;
    165    UChar32 *pDest;
    166    int32_t reqLength;
    167    int32_t numSubstitutions;
    168 
    169    /* args check */
    170    if(U_FAILURE(*pErrorCode)){
    171        return nullptr;
    172    }
    173    if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
    174        (destCapacity<0) || (dest == nullptr && destCapacity > 0) ||
    175        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    176    ) {
    177        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    178        return nullptr;
    179    }
    180 
    181    if(pNumSubstitutions != nullptr) {
    182        *pNumSubstitutions = 0;
    183    }
    184 
    185    pDest = dest;
    186    destLimit = (dest!=nullptr)?(dest + destCapacity):nullptr;
    187    reqLength = 0;
    188    numSubstitutions = 0;
    189 
    190    if(srcLength < 0) {
    191        /* simple loop for conversion of a NUL-terminated BMP string */
    192        while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
    193            ++src;
    194            if(pDest < destLimit) {
    195                *pDest++ = ch;
    196            } else {
    197                ++reqLength;
    198            }
    199        }
    200        srcLimit = src;
    201        if(ch != 0) {
    202            /* "complicated" case, find the end of the remaining string */
    203            while(*++srcLimit != 0) {}
    204        }
    205    } else {
    206        srcLimit = (src!=nullptr)?(src + srcLength):nullptr;
    207    }
    208 
    209    /* convert with length */
    210    while(src < srcLimit) {
    211        ch = *src++;
    212        if(!U16_IS_SURROGATE(ch)) {
    213            /* write or count ch below */
    214        } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
    215            ++src;
    216            ch = U16_GET_SUPPLEMENTARY(ch, ch2);
    217        } else if((ch = subchar) < 0) {
    218            /* unpaired surrogate */
    219            *pErrorCode = U_INVALID_CHAR_FOUND;
    220            return nullptr;
    221        } else {
    222            ++numSubstitutions;
    223        }
    224        if(pDest < destLimit) {
    225            *pDest++ = ch;
    226        } else {
    227            ++reqLength;
    228        }
    229    }
    230 
    231    reqLength += (int32_t)(pDest - dest);
    232    if(pDestLength) {
    233        *pDestLength = reqLength;
    234    }
    235    if(pNumSubstitutions != nullptr) {
    236        *pNumSubstitutions = numSubstitutions;
    237    }
    238 
    239    /* Terminate the buffer */
    240    u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
    241 
    242    return dest;
    243 }
    244 
    245 U_CAPI UChar32* U_EXPORT2 
    246 u_strToUTF32(UChar32 *dest, 
    247             int32_t destCapacity,
    248             int32_t *pDestLength,
    249             const char16_t *src,
    250             int32_t srcLength,
    251             UErrorCode *pErrorCode) {
    252    return u_strToUTF32WithSub(
    253            dest, destCapacity, pDestLength,
    254            src, srcLength,
    255            U_SENTINEL, nullptr,
    256            pErrorCode);
    257 }
    258 
    259 U_CAPI char16_t* U_EXPORT2
    260 u_strFromUTF8WithSub(char16_t *dest,
    261              int32_t destCapacity,
    262              int32_t *pDestLength,
    263              const char* src,
    264              int32_t srcLength,
    265              UChar32 subchar, int32_t *pNumSubstitutions,
    266              UErrorCode *pErrorCode){
    267    /* args check */
    268    if(U_FAILURE(*pErrorCode)) {
    269        return nullptr;
    270    }
    271    if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
    272        (destCapacity<0) || (dest == nullptr && destCapacity > 0) ||
    273        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    274    ) {
    275        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    276        return nullptr;
    277    }
    278 
    279    if(pNumSubstitutions!=nullptr) {
    280        *pNumSubstitutions=0;
    281    }
    282    char16_t *pDest = dest;
    283    char16_t *pDestLimit = dest+destCapacity;
    284    int32_t reqLength = 0;
    285    int32_t numSubstitutions=0;
    286 
    287    /*
    288     * Inline processing of UTF-8 byte sequences:
    289     *
    290     * Byte sequences for the most common characters are handled inline in
    291     * the conversion loops. In order to reduce the path lengths for those
    292     * characters, the tests are arranged in a kind of binary search.
    293     * ASCII (<=0x7f) is checked first, followed by the dividing point
    294     * between 2- and 3-byte sequences (0xe0).
    295     * The 3-byte branch is tested first to speed up CJK text.
    296     * The compiler should combine the subtractions for the two tests for 0xe0.
    297     * Each branch then tests for the other end of its range.
    298     */
    299 
    300    if(srcLength < 0){
    301        /*
    302         * Transform a NUL-terminated string.
    303         * The code explicitly checks for NULs only in the lead byte position.
    304         * A NUL byte in the trail byte position fails the trail byte range check anyway.
    305         */
    306        int32_t i;
    307        UChar32 c;
    308        for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
    309            // modified copy of U8_NEXT()
    310            ++i;
    311            if(U8_IS_SINGLE(c)) {
    312                *pDest++=(char16_t)c;
    313            } else {
    314                uint8_t __t1, __t2;
    315                if( /* handle U+0800..U+FFFF inline */
    316                        (0xe0<=(c) && (c)<0xf0) &&
    317                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
    318                        (__t2=src[(i)+1]-0x80)<=0x3f) {
    319                    *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
    320                    i+=2;
    321                } else if( /* handle U+0080..U+07FF inline */
    322                        ((c)<0xe0 && (c)>=0xc2) &&
    323                        (__t1=src[i]-0x80)<=0x3f) {
    324                    *pDest++ = (((c)&0x1f)<<6)|__t1;
    325                    ++(i);
    326                } else {
    327                    /* function call for "complicated" and error cases */
    328                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
    329                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
    330                        *pErrorCode = U_INVALID_CHAR_FOUND;
    331                        return nullptr;
    332                    } else if(c<=0xFFFF) {
    333                        *(pDest++)=(char16_t)c;
    334                    } else {
    335                        *(pDest++)=U16_LEAD(c);
    336                        if(pDest<pDestLimit) {
    337                            *(pDest++)=U16_TRAIL(c);
    338                        } else {
    339                            reqLength++;
    340                            break;
    341                        }
    342                    }
    343                }
    344            }
    345        }
    346 
    347        /* Pre-flight the rest of the string. */
    348        while((c = (uint8_t)src[i]) != 0) {
    349            // modified copy of U8_NEXT()
    350            ++i;
    351            if(U8_IS_SINGLE(c)) {
    352                ++reqLength;
    353            } else {
    354                uint8_t __t1, __t2;
    355                if( /* handle U+0800..U+FFFF inline */
    356                        (0xe0<=(c) && (c)<0xf0) &&
    357                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
    358                        (__t2=src[(i)+1]-0x80)<=0x3f) {
    359                    ++reqLength;
    360                    i+=2;
    361                } else if( /* handle U+0080..U+07FF inline */
    362                        ((c)<0xe0 && (c)>=0xc2) &&
    363                        (__t1=src[i]-0x80)<=0x3f) {
    364                    ++reqLength;
    365                    ++(i);
    366                } else {
    367                    /* function call for "complicated" and error cases */
    368                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
    369                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
    370                        *pErrorCode = U_INVALID_CHAR_FOUND;
    371                        return nullptr;
    372                    }
    373                    reqLength += U16_LENGTH(c);
    374                }
    375            }
    376        }
    377    } else /* srcLength >= 0 */ {
    378        /* Faster loop without ongoing checking for srcLength and pDestLimit. */
    379        int32_t i = 0;
    380        UChar32 c;
    381        for(;;) {
    382            /*
    383             * Each iteration of the inner loop progresses by at most 3 UTF-8
    384             * bytes and one char16_t, for most characters.
    385             * For supplementary code points (4 & 2), which are rare,
    386             * there is an additional adjustment.
    387             */
    388            int32_t count = (int32_t)(pDestLimit - pDest);
    389            int32_t count2 = (srcLength - i) / 3;
    390            if(count > count2) {
    391                count = count2; /* min(remaining dest, remaining src/3) */
    392            }
    393            if(count < 3) {
    394                /*
    395                 * Too much overhead if we get near the end of the string,
    396                 * continue with the next loop.
    397                 */
    398                break;
    399            }
    400 
    401            do {
    402                // modified copy of U8_NEXT()
    403                c = (uint8_t)src[i++];
    404                if(U8_IS_SINGLE(c)) {
    405                    *pDest++=(char16_t)c;
    406                } else {
    407                    uint8_t __t1, __t2;
    408                    if( /* handle U+0800..U+FFFF inline */
    409                            (0xe0<=(c) && (c)<0xf0) &&
    410                            ((i)+1)<srcLength &&
    411                            U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
    412                            (__t2=src[(i)+1]-0x80)<=0x3f) {
    413                        *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
    414                        i+=2;
    415                    } else if( /* handle U+0080..U+07FF inline */
    416                            ((c)<0xe0 && (c)>=0xc2) &&
    417                            ((i)!=srcLength) &&
    418                            (__t1=src[i]-0x80)<=0x3f) {
    419                        *pDest++ = (((c)&0x1f)<<6)|__t1;
    420                        ++(i);
    421                    } else {
    422                        if(c >= 0xf0 || subchar > 0xffff) {
    423                            // We may read up to four bytes and write up to two UChars,
    424                            // which we didn't account for with computing count,
    425                            // so we adjust it here.
    426                            if(--count == 0) {
    427                                --i;  // back out byte c
    428                                break;
    429                            }
    430                        }
    431 
    432                        /* function call for "complicated" and error cases */
    433                        (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
    434                        if(c<0 && (++numSubstitutions, c = subchar) < 0) {
    435                            *pErrorCode = U_INVALID_CHAR_FOUND;
    436                            return nullptr;
    437                        } else if(c<=0xFFFF) {
    438                            *(pDest++)=(char16_t)c;
    439                        } else {
    440                            *(pDest++)=U16_LEAD(c);
    441                            *(pDest++)=U16_TRAIL(c);
    442                        }
    443                    }
    444                }
    445            } while(--count > 0);
    446        }
    447 
    448        while(i < srcLength && (pDest < pDestLimit)) {
    449            // modified copy of U8_NEXT()
    450            c = (uint8_t)src[i++];
    451            if(U8_IS_SINGLE(c)) {
    452                *pDest++=(char16_t)c;
    453            } else {
    454                uint8_t __t1, __t2;
    455                if( /* handle U+0800..U+FFFF inline */
    456                        (0xe0<=(c) && (c)<0xf0) &&
    457                        ((i)+1)<srcLength &&
    458                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
    459                        (__t2=src[(i)+1]-0x80)<=0x3f) {
    460                    *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
    461                    i+=2;
    462                } else if( /* handle U+0080..U+07FF inline */
    463                        ((c)<0xe0 && (c)>=0xc2) &&
    464                        ((i)!=srcLength) &&
    465                        (__t1=src[i]-0x80)<=0x3f) {
    466                    *pDest++ = (((c)&0x1f)<<6)|__t1;
    467                    ++(i);
    468                } else {
    469                    /* function call for "complicated" and error cases */
    470                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
    471                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
    472                        *pErrorCode = U_INVALID_CHAR_FOUND;
    473                        return nullptr;
    474                    } else if(c<=0xFFFF) {
    475                        *(pDest++)=(char16_t)c;
    476                    } else {
    477                        *(pDest++)=U16_LEAD(c);
    478                        if(pDest<pDestLimit) {
    479                            *(pDest++)=U16_TRAIL(c);
    480                        } else {
    481                            reqLength++;
    482                            break;
    483                        }
    484                    }
    485                }
    486            }
    487        }
    488 
    489        /* Pre-flight the rest of the string. */
    490        while(i < srcLength) {
    491            // modified copy of U8_NEXT()
    492            c = (uint8_t)src[i++];
    493            if(U8_IS_SINGLE(c)) {
    494                ++reqLength;
    495            } else {
    496                uint8_t __t1, __t2;
    497                if( /* handle U+0800..U+FFFF inline */
    498                        (0xe0<=(c) && (c)<0xf0) &&
    499                        ((i)+1)<srcLength &&
    500                        U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
    501                        (__t2=src[(i)+1]-0x80)<=0x3f) {
    502                    ++reqLength;
    503                    i+=2;
    504                } else if( /* handle U+0080..U+07FF inline */
    505                        ((c)<0xe0 && (c)>=0xc2) &&
    506                        ((i)!=srcLength) &&
    507                        (__t1=src[i]-0x80)<=0x3f) {
    508                    ++reqLength;
    509                    ++(i);
    510                } else {
    511                    /* function call for "complicated" and error cases */
    512                    (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
    513                    if(c<0 && (++numSubstitutions, c = subchar) < 0) {
    514                        *pErrorCode = U_INVALID_CHAR_FOUND;
    515                        return nullptr;
    516                    }
    517                    reqLength += U16_LENGTH(c);
    518                }
    519            }
    520        }
    521    }
    522 
    523    reqLength+=(int32_t)(pDest - dest);
    524 
    525    if(pNumSubstitutions!=nullptr) {
    526        *pNumSubstitutions=numSubstitutions;
    527    }
    528 
    529    if(pDestLength){
    530        *pDestLength = reqLength;
    531    }
    532 
    533    /* Terminate the buffer */
    534    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    535 
    536    return dest;
    537 }
    538 
    539 U_CAPI char16_t* U_EXPORT2
    540 u_strFromUTF8(char16_t *dest,
    541              int32_t destCapacity,
    542              int32_t *pDestLength,
    543              const char* src,
    544              int32_t srcLength,
    545              UErrorCode *pErrorCode){
    546    return u_strFromUTF8WithSub(
    547            dest, destCapacity, pDestLength,
    548            src, srcLength,
    549            U_SENTINEL, nullptr,
    550            pErrorCode);
    551 }
    552 
    553 U_CAPI char16_t * U_EXPORT2
    554 u_strFromUTF8Lenient(char16_t *dest,
    555                     int32_t destCapacity,
    556                     int32_t *pDestLength,
    557                     const char *src,
    558                     int32_t srcLength,
    559                     UErrorCode *pErrorCode) {
    560    char16_t *pDest = dest;
    561    UChar32 ch;
    562    int32_t reqLength = 0;
    563    uint8_t* pSrc = (uint8_t*) src;
    564 
    565    /* args check */
    566    if(U_FAILURE(*pErrorCode)){
    567        return nullptr;
    568    }
    569        
    570    if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
    571        (destCapacity<0) || (dest == nullptr && destCapacity > 0)
    572    ) {
    573        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    574        return nullptr;
    575    }
    576 
    577    if(srcLength < 0) {
    578        /* Transform a NUL-terminated string. */
    579        char16_t *pDestLimit = (dest!=nullptr)?(dest+destCapacity):nullptr;
    580        uint8_t t1, t2, t3; /* trail bytes */
    581 
    582        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
    583            if(ch < 0xc0) {
    584                /*
    585                 * ASCII, or a trail byte in lead position which is treated like
    586                 * a single-byte sequence for better character boundary
    587                 * resynchronization after illegal sequences.
    588                 */
    589                *pDest++=(char16_t)ch;
    590                ++pSrc;
    591                continue;
    592            } else if(ch < 0xe0) { /* U+0080..U+07FF */
    593                if((t1 = pSrc[1]) != 0) {
    594                    /* 0x3080 = (0xc0 << 6) + 0x80 */
    595                    *pDest++ = (char16_t)((ch << 6) + t1 - 0x3080);
    596                    pSrc += 2;
    597                    continue;
    598                }
    599            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    600                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
    601                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
    602                    /* 0x2080 = (0x80 << 6) + 0x80 */
    603                    *pDest++ = (char16_t)((ch << 12) + (t1 << 6) + t2 - 0x2080);
    604                    pSrc += 3;
    605                    continue;
    606                }
    607            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    608                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
    609                    pSrc += 4;
    610                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    611                    ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
    612                    *(pDest++) = U16_LEAD(ch);
    613                    if(pDest < pDestLimit) {
    614                        *(pDest++) = U16_TRAIL(ch);
    615                    } else {
    616                        reqLength = 1;
    617                        break;
    618                    }
    619                    continue;
    620                }
    621            }
    622 
    623            /* truncated character at the end */
    624            *pDest++ = 0xfffd;
    625            while(*++pSrc != 0) {}
    626            break;
    627        }
    628 
    629        /* Pre-flight the rest of the string. */
    630        while((ch = *pSrc) != 0) {
    631            if(ch < 0xc0) {
    632                /*
    633                 * ASCII, or a trail byte in lead position which is treated like
    634                 * a single-byte sequence for better character boundary
    635                 * resynchronization after illegal sequences.
    636                 */
    637                ++reqLength;
    638                ++pSrc;
    639                continue;
    640            } else if(ch < 0xe0) { /* U+0080..U+07FF */
    641                if(pSrc[1] != 0) {
    642                    ++reqLength;
    643                    pSrc += 2;
    644                    continue;
    645                }
    646            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    647                if(pSrc[1] != 0 && pSrc[2] != 0) {
    648                    ++reqLength;
    649                    pSrc += 3;
    650                    continue;
    651                }
    652            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    653                if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
    654                    reqLength += 2;
    655                    pSrc += 4;
    656                    continue;
    657                }
    658            }
    659 
    660            /* truncated character at the end */
    661            ++reqLength;
    662            break;
    663        }
    664    } else /* srcLength >= 0 */ {
    665      const uint8_t *pSrcLimit = (pSrc!=nullptr)?(pSrc + srcLength):nullptr;
    666 
    667        /*
    668         * This function requires that if srcLength is given, then it must be
    669         * destCapatity >= srcLength so that we need not check for
    670         * destination buffer overflow in the loop.
    671         */
    672        if(destCapacity < srcLength) {
    673            if(pDestLength != nullptr) {
    674                *pDestLength = srcLength; /* this likely overestimates the true destLength! */
    675            }
    676            *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
    677            return nullptr;
    678        }
    679 
    680        if((pSrcLimit - pSrc) >= 4) {
    681            pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
    682 
    683            /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
    684            do {
    685                ch = *pSrc++;
    686                if(ch < 0xc0) {
    687                    /*
    688                     * ASCII, or a trail byte in lead position which is treated like
    689                     * a single-byte sequence for better character boundary
    690                     * resynchronization after illegal sequences.
    691                     */
    692                    *pDest++=(char16_t)ch;
    693                } else if(ch < 0xe0) { /* U+0080..U+07FF */
    694                    /* 0x3080 = (0xc0 << 6) + 0x80 */
    695                    *pDest++ = (char16_t)((ch << 6) + *pSrc++ - 0x3080);
    696                } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    697                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
    698                    /* 0x2080 = (0x80 << 6) + 0x80 */
    699                    ch = (ch << 12) + (*pSrc++ << 6);
    700                    *pDest++ = (char16_t)(ch + *pSrc++ - 0x2080);
    701                } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    702                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    703                    ch = (ch << 18) + (*pSrc++ << 12);
    704                    ch += *pSrc++ << 6;
    705                    ch += *pSrc++ - 0x3c82080;
    706                    *(pDest++) = U16_LEAD(ch);
    707                    *(pDest++) = U16_TRAIL(ch);
    708                }
    709            } while(pSrc < pSrcLimit);
    710 
    711            pSrcLimit += 3; /* restore original pSrcLimit */
    712        }
    713 
    714        while(pSrc < pSrcLimit) {
    715            ch = *pSrc++;
    716            if(ch < 0xc0) {
    717                /*
    718                 * ASCII, or a trail byte in lead position which is treated like
    719                 * a single-byte sequence for better character boundary
    720                 * resynchronization after illegal sequences.
    721                 */
    722                *pDest++=(char16_t)ch;
    723                continue;
    724            } else if(ch < 0xe0) { /* U+0080..U+07FF */
    725                if(pSrc < pSrcLimit) {
    726                    /* 0x3080 = (0xc0 << 6) + 0x80 */
    727                    *pDest++ = (char16_t)((ch << 6) + *pSrc++ - 0x3080);
    728                    continue;
    729                }
    730            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    731                if((pSrcLimit - pSrc) >= 2) {
    732                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
    733                    /* 0x2080 = (0x80 << 6) + 0x80 */
    734                    ch = (ch << 12) + (*pSrc++ << 6);
    735                    *pDest++ = (char16_t)(ch + *pSrc++ - 0x2080);
    736                    pSrc += 3;
    737                    continue;
    738                }
    739            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    740                if((pSrcLimit - pSrc) >= 3) {
    741                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    742                    ch = (ch << 18) + (*pSrc++ << 12);
    743                    ch += *pSrc++ << 6;
    744                    ch += *pSrc++ - 0x3c82080;
    745                    *(pDest++) = U16_LEAD(ch);
    746                    *(pDest++) = U16_TRAIL(ch);
    747                    pSrc += 4;
    748                    continue;
    749                }
    750            }
    751 
    752            /* truncated character at the end */
    753            *pDest++ = 0xfffd;
    754            break;
    755        }
    756    }
    757 
    758    reqLength+=(int32_t)(pDest - dest);
    759 
    760    if(pDestLength){
    761        *pDestLength = reqLength;
    762    }
    763 
    764    /* Terminate the buffer */
    765    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    766 
    767    return dest;
    768 }
    769 
    770 static inline uint8_t *
    771 _appendUTF8(uint8_t *pDest, UChar32 c) {
    772    /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
    773    if((c)<=0x7f) {
    774        *pDest++ = static_cast<uint8_t>(c);
    775    } else if(c<=0x7ff) {
    776        *pDest++ = static_cast<uint8_t>((c >> 6) | 0xc0);
    777        *pDest++ = static_cast<uint8_t>((c & 0x3f) | 0x80);
    778    } else if(c<=0xffff) {
    779        *pDest++ = static_cast<uint8_t>((c >> 12) | 0xe0);
    780        *pDest++ = static_cast<uint8_t>(((c >> 6) & 0x3f) | 0x80);
    781        *pDest++ = static_cast<uint8_t>(((c) & 0x3f) | 0x80);
    782    } else /* if((uint32_t)(c)<=0x10ffff) */ {
    783        *pDest++ = static_cast<uint8_t>(((c) >> 18) | 0xf0);
    784        *pDest++ = static_cast<uint8_t>((((c) >> 12) & 0x3f) | 0x80);
    785        *pDest++ = static_cast<uint8_t>((((c) >> 6) & 0x3f) | 0x80);
    786        *pDest++ = static_cast<uint8_t>(((c) & 0x3f) | 0x80);
    787    }
    788    return pDest;
    789 }
    790 
    791   
    792 U_CAPI char* U_EXPORT2 
    793 u_strToUTF8WithSub(char *dest,
    794            int32_t destCapacity,
    795            int32_t *pDestLength,
    796            const char16_t *pSrc,
    797            int32_t srcLength,
    798            UChar32 subchar, int32_t *pNumSubstitutions,
    799            UErrorCode *pErrorCode){
    800    int32_t reqLength=0;
    801    uint32_t ch=0,ch2=0;
    802    uint8_t *pDest = (uint8_t *)dest;
    803    uint8_t *pDestLimit = (pDest!=nullptr)?(pDest + destCapacity):nullptr;
    804    int32_t numSubstitutions;
    805 
    806    /* args check */
    807    if(U_FAILURE(*pErrorCode)){
    808        return nullptr;
    809    }
    810        
    811    if( (pSrc==nullptr && srcLength!=0) || srcLength < -1 ||
    812        (destCapacity<0) || (dest == nullptr && destCapacity > 0) ||
    813        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    814    ) {
    815        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    816        return nullptr;
    817    }
    818 
    819    if(pNumSubstitutions!=nullptr) {
    820        *pNumSubstitutions=0;
    821    }
    822    numSubstitutions=0;
    823 
    824    if(srcLength==-1) {
    825        while((ch=*pSrc)!=0) {
    826            ++pSrc;
    827            if(ch <= 0x7f) {
    828                if(pDest<pDestLimit) {
    829                    *pDest++ = (uint8_t)ch;
    830                } else {
    831                    reqLength = 1;
    832                    break;
    833                }
    834            } else if(ch <= 0x7ff) {
    835                if((pDestLimit - pDest) >= 2) {
    836                    *pDest++=(uint8_t)((ch>>6)|0xc0);
    837                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
    838                } else {
    839                    reqLength = 2;
    840                    break;
    841                }
    842            } else if(ch <= 0xd7ff || ch >= 0xe000) {
    843                if((pDestLimit - pDest) >= 3) {
    844                    *pDest++=(uint8_t)((ch>>12)|0xe0);
    845                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
    846                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
    847                } else {
    848                    reqLength = 3;
    849                    break;
    850                }
    851            } else /* ch is a surrogate */ {
    852                int32_t length;
    853 
    854                /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
    855                if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 
    856                    ++pSrc;
    857                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
    858                } else if(subchar>=0) {
    859                    ch=subchar;
    860                    ++numSubstitutions;
    861                } else {
    862                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
    863                    *pErrorCode = U_INVALID_CHAR_FOUND;
    864                    return nullptr;
    865                }
    866 
    867                length = U8_LENGTH(ch);
    868                if((pDestLimit - pDest) >= length) {
    869                    /* convert and append*/
    870                    pDest=_appendUTF8(pDest, ch);
    871                } else {
    872                    reqLength = length;
    873                    break;
    874                }
    875            }
    876        }
    877        while((ch=*pSrc++)!=0) {
    878            if(ch<=0x7f) {
    879                ++reqLength;
    880            } else if(ch<=0x7ff) {
    881                reqLength+=2;
    882            } else if(!U16_IS_SURROGATE(ch)) {
    883                reqLength+=3;
    884            } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
    885                ++pSrc;
    886                reqLength+=4;
    887            } else if(subchar>=0) {
    888                reqLength+=U8_LENGTH(subchar);
    889                ++numSubstitutions;
    890            } else {
    891                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
    892                *pErrorCode = U_INVALID_CHAR_FOUND;
    893                return nullptr;
    894            }
    895        }
    896    } else {
    897        const char16_t *pSrcLimit = (pSrc!=nullptr)?(pSrc+srcLength):nullptr;
    898        int32_t count;
    899 
    900        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
    901        for(;;) {
    902            /*
    903             * Each iteration of the inner loop progresses by at most 3 UTF-8
    904             * bytes and one char16_t, for most characters.
    905             * For supplementary code points (4 & 2), which are rare,
    906             * there is an additional adjustment.
    907             */
    908            count = (int32_t)((pDestLimit - pDest) / 3);
    909            srcLength = (int32_t)(pSrcLimit - pSrc);
    910            if(count > srcLength) {
    911                count = srcLength; /* min(remaining dest/3, remaining src) */
    912            }
    913            if(count < 3) {
    914                /*
    915                 * Too much overhead if we get near the end of the string,
    916                 * continue with the next loop.
    917                 */
    918                break;
    919            }
    920            do {
    921                ch=*pSrc++;
    922                if(ch <= 0x7f) {
    923                    *pDest++ = (uint8_t)ch;
    924                } else if(ch <= 0x7ff) {
    925                    *pDest++=(uint8_t)((ch>>6)|0xc0);
    926                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
    927                } else if(ch <= 0xd7ff || ch >= 0xe000) {
    928                    *pDest++=(uint8_t)((ch>>12)|0xe0);
    929                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
    930                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
    931                } else /* ch is a surrogate */ {
    932                    /*
    933                     * We will read two UChars and probably output four bytes,
    934                     * which we didn't account for with computing count,
    935                     * so we adjust it here.
    936                     */
    937                    if(--count == 0) {
    938                        --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
    939                        break;  /* recompute count */
    940                    }
    941 
    942                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 
    943                        ++pSrc;
    944                        ch=U16_GET_SUPPLEMENTARY(ch, ch2);
    945 
    946                        /* writing 4 bytes per 2 UChars is ok */
    947                        *pDest++=(uint8_t)((ch>>18)|0xf0);
    948                        *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
    949                        *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
    950                        *pDest++=(uint8_t)((ch&0x3f)|0x80);
    951                    } else  {
    952                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
    953                        if(subchar>=0) {
    954                            ch=subchar;
    955                            ++numSubstitutions;
    956                        } else {
    957                            *pErrorCode = U_INVALID_CHAR_FOUND;
    958                            return nullptr;
    959                        }
    960 
    961                        /* convert and append*/
    962                        pDest=_appendUTF8(pDest, ch);
    963                    }
    964                }
    965            } while(--count > 0);
    966        }
    967 
    968        while(pSrc<pSrcLimit) {
    969            ch=*pSrc++;
    970            if(ch <= 0x7f) {
    971                if(pDest<pDestLimit) {
    972                    *pDest++ = (uint8_t)ch;
    973                } else {
    974                    reqLength = 1;
    975                    break;
    976                }
    977            } else if(ch <= 0x7ff) {
    978                if((pDestLimit - pDest) >= 2) {
    979                    *pDest++=(uint8_t)((ch>>6)|0xc0);
    980                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
    981                } else {
    982                    reqLength = 2;
    983                    break;
    984                }
    985            } else if(ch <= 0xd7ff || ch >= 0xe000) {
    986                if((pDestLimit - pDest) >= 3) {
    987                    *pDest++=(uint8_t)((ch>>12)|0xe0);
    988                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
    989                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
    990                } else {
    991                    reqLength = 3;
    992                    break;
    993                }
    994            } else /* ch is a surrogate */ {
    995                int32_t length;
    996 
    997                if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 
    998                    ++pSrc;
    999                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
   1000                } else if(subchar>=0) {
   1001                    ch=subchar;
   1002                    ++numSubstitutions;
   1003                } else {
   1004                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1005                    *pErrorCode = U_INVALID_CHAR_FOUND;
   1006                    return nullptr;
   1007                }
   1008 
   1009                length = U8_LENGTH(ch);
   1010                if((pDestLimit - pDest) >= length) {
   1011                    /* convert and append*/
   1012                    pDest=_appendUTF8(pDest, ch);
   1013                } else {
   1014                    reqLength = length;
   1015                    break;
   1016                }
   1017            }
   1018        }
   1019        while(pSrc<pSrcLimit) {
   1020            ch=*pSrc++;
   1021            if(ch<=0x7f) {
   1022                ++reqLength;
   1023            } else if(ch<=0x7ff) {
   1024                reqLength+=2;
   1025            } else if(!U16_IS_SURROGATE(ch)) {
   1026                reqLength+=3;
   1027            } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
   1028                ++pSrc;
   1029                reqLength+=4;
   1030            } else if(subchar>=0) {
   1031                reqLength+=U8_LENGTH(subchar);
   1032                ++numSubstitutions;
   1033            } else {
   1034                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1035                *pErrorCode = U_INVALID_CHAR_FOUND;
   1036                return nullptr;
   1037            }
   1038        }
   1039    }
   1040 
   1041    reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1042 
   1043    if(pNumSubstitutions!=nullptr) {
   1044        *pNumSubstitutions=numSubstitutions;
   1045    }
   1046 
   1047    if(pDestLength){
   1048        *pDestLength = reqLength;
   1049    }
   1050 
   1051    /* Terminate the buffer */
   1052    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1053    return dest;
   1054 }
   1055 
   1056 U_CAPI char* U_EXPORT2 
   1057 u_strToUTF8(char *dest,
   1058            int32_t destCapacity,
   1059            int32_t *pDestLength,
   1060            const char16_t *pSrc,
   1061            int32_t srcLength,
   1062            UErrorCode *pErrorCode){
   1063    return u_strToUTF8WithSub(
   1064            dest, destCapacity, pDestLength,
   1065            pSrc, srcLength,
   1066            U_SENTINEL, nullptr,
   1067            pErrorCode);
   1068 }
   1069 
   1070 U_CAPI char16_t* U_EXPORT2
   1071 u_strFromJavaModifiedUTF8WithSub(
   1072        char16_t *dest,
   1073        int32_t destCapacity,
   1074        int32_t *pDestLength,
   1075        const char *src,
   1076        int32_t srcLength,
   1077        UChar32 subchar, int32_t *pNumSubstitutions,
   1078        UErrorCode *pErrorCode) {
   1079    /* args check */
   1080    if(U_FAILURE(*pErrorCode)) {
   1081        return nullptr;
   1082    }
   1083    if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
   1084        (dest==nullptr && destCapacity!=0) || destCapacity<0 ||
   1085        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
   1086    ) {
   1087        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1088        return nullptr;
   1089    }
   1090 
   1091    if(pNumSubstitutions!=nullptr) {
   1092        *pNumSubstitutions=0;
   1093    }
   1094    char16_t *pDest = dest;
   1095    char16_t *pDestLimit = dest+destCapacity;
   1096    int32_t reqLength = 0;
   1097    int32_t numSubstitutions=0;
   1098 
   1099    if(srcLength < 0) {
   1100        /*
   1101         * Transform a NUL-terminated ASCII string.
   1102         * Handle non-ASCII strings with slower code.
   1103         */
   1104        UChar32 c;
   1105        while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
   1106            *pDest++=(char16_t)c;
   1107            ++src;
   1108        }
   1109        if(c == 0) {
   1110            reqLength=(int32_t)(pDest - dest);
   1111            if(pDestLength) {
   1112                *pDestLength = reqLength;
   1113            }
   1114 
   1115            /* Terminate the buffer */
   1116            u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1117            return dest;
   1118        }
   1119        srcLength = static_cast<int32_t>(uprv_strlen(src));
   1120    }
   1121 
   1122    /* Faster loop without ongoing checking for srcLength and pDestLimit. */
   1123    UChar32 ch;
   1124    uint8_t t1, t2;
   1125    int32_t i = 0;
   1126    for(;;) {
   1127        int32_t count = (int32_t)(pDestLimit - pDest);
   1128        int32_t count2 = srcLength - i;
   1129        if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
   1130            /* fast ASCII loop */
   1131            int32_t start = i;
   1132            uint8_t b;
   1133            while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
   1134                *pDest++=b;
   1135                ++i;
   1136            }
   1137            int32_t delta = i - start;
   1138            count -= delta;
   1139            count2 -= delta;
   1140        }
   1141        /*
   1142         * Each iteration of the inner loop progresses by at most 3 UTF-8
   1143         * bytes and one char16_t.
   1144         */
   1145        if(subchar > 0xFFFF) {
   1146            break;
   1147        }
   1148        count2 /= 3;
   1149        if(count > count2) {
   1150            count = count2; /* min(remaining dest, remaining src/3) */
   1151        }
   1152        if(count < 3) {
   1153            /*
   1154             * Too much overhead if we get near the end of the string,
   1155             * continue with the next loop.
   1156             */
   1157            break;
   1158        }
   1159        do {
   1160            ch = (uint8_t)src[i++];
   1161            if(U8_IS_SINGLE(ch)) {
   1162                *pDest++=(char16_t)ch;
   1163            } else {
   1164                if(ch >= 0xe0) {
   1165                    if( /* handle U+0000..U+FFFF inline */
   1166                        ch <= 0xef &&
   1167                        (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
   1168                        (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
   1169                    ) {
   1170                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
   1171                        *pDest++ = (char16_t)((ch << 12) | (t1 << 6) | t2);
   1172                        i += 2;
   1173                        continue;
   1174                    }
   1175                } else {
   1176                    if( /* handle U+0000..U+07FF inline */
   1177                        ch >= 0xc0 &&
   1178                        (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
   1179                    ) {
   1180                        *pDest++ = (char16_t)(((ch & 0x1f) << 6) | t1);
   1181                        ++i;
   1182                        continue;
   1183                    }
   1184                }
   1185 
   1186                if(subchar < 0) {
   1187                    *pErrorCode = U_INVALID_CHAR_FOUND;
   1188                    return nullptr;
   1189                } else if(subchar > 0xffff && --count == 0) {
   1190                    /*
   1191                     * We need to write two UChars, adjusted count for that,
   1192                     * and ran out of space.
   1193                     */
   1194                    --i;  // back out byte ch
   1195                    break;
   1196                } else {
   1197                    /* function call for error cases */
   1198                    utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
   1199                    ++numSubstitutions;
   1200                    *(pDest++)=(char16_t)subchar;
   1201                }
   1202            }
   1203        } while(--count > 0);
   1204    }
   1205 
   1206    while(i < srcLength && (pDest < pDestLimit)) {
   1207        ch = (uint8_t)src[i++];
   1208        if(U8_IS_SINGLE(ch)){
   1209            *pDest++=(char16_t)ch;
   1210        } else {
   1211            if(ch >= 0xe0) {
   1212                if( /* handle U+0000..U+FFFF inline */
   1213                    ch <= 0xef &&
   1214                    (i+1) < srcLength &&
   1215                    (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
   1216                    (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
   1217                ) {
   1218                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
   1219                    *pDest++ = (char16_t)((ch << 12) | (t1 << 6) | t2);
   1220                    i += 2;
   1221                    continue;
   1222                }
   1223            } else {
   1224                if( /* handle U+0000..U+07FF inline */
   1225                    ch >= 0xc0 &&
   1226                    i < srcLength &&
   1227                    (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
   1228                ) {
   1229                    *pDest++ = (char16_t)(((ch & 0x1f) << 6) | t1);
   1230                    ++i;
   1231                    continue;
   1232                }
   1233            }
   1234 
   1235            if(subchar < 0) {
   1236                *pErrorCode = U_INVALID_CHAR_FOUND;
   1237                return nullptr;
   1238            } else {
   1239                /* function call for error cases */
   1240                utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
   1241                ++numSubstitutions;
   1242                if(subchar<=0xFFFF) {
   1243                    *(pDest++)=(char16_t)subchar;
   1244                } else {
   1245                    *(pDest++)=U16_LEAD(subchar);
   1246                    if(pDest<pDestLimit) {
   1247                        *(pDest++)=U16_TRAIL(subchar);
   1248                    } else {
   1249                        reqLength++;
   1250                        break;
   1251                    }
   1252                }
   1253            }
   1254        }
   1255    }
   1256 
   1257    /* Pre-flight the rest of the string. */
   1258    while(i < srcLength) {
   1259        ch = (uint8_t)src[i++];
   1260        if(U8_IS_SINGLE(ch)) {
   1261            reqLength++;
   1262        } else {
   1263            if(ch >= 0xe0) {
   1264                if( /* handle U+0000..U+FFFF inline */
   1265                    ch <= 0xef &&
   1266                    (i+1) < srcLength &&
   1267                    (uint8_t)(src[i] - 0x80) <= 0x3f &&
   1268                    (uint8_t)(src[i+1] - 0x80) <= 0x3f
   1269                ) {
   1270                    reqLength++;
   1271                    i += 2;
   1272                    continue;
   1273                }
   1274            } else {
   1275                if( /* handle U+0000..U+07FF inline */
   1276                    ch >= 0xc0 &&
   1277                    i < srcLength &&
   1278                    (uint8_t)(src[i] - 0x80) <= 0x3f
   1279                ) {
   1280                    reqLength++;
   1281                    ++i;
   1282                    continue;
   1283                }
   1284            }
   1285 
   1286            if(subchar < 0) {
   1287                *pErrorCode = U_INVALID_CHAR_FOUND;
   1288                return nullptr;
   1289            } else {
   1290                /* function call for error cases */
   1291                utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
   1292                ++numSubstitutions;
   1293                reqLength+=U16_LENGTH(ch);
   1294            }
   1295        }
   1296    }
   1297 
   1298    if(pNumSubstitutions!=nullptr) {
   1299        *pNumSubstitutions=numSubstitutions;
   1300    }
   1301 
   1302    reqLength+=(int32_t)(pDest - dest);
   1303    if(pDestLength) {
   1304        *pDestLength = reqLength;
   1305    }
   1306 
   1307    /* Terminate the buffer */
   1308    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1309    return dest;
   1310 }
   1311 
   1312 U_CAPI char* U_EXPORT2 
   1313 u_strToJavaModifiedUTF8(
   1314        char *dest,
   1315        int32_t destCapacity,
   1316        int32_t *pDestLength,
   1317        const char16_t *src,
   1318        int32_t srcLength,
   1319        UErrorCode *pErrorCode) {
   1320    int32_t reqLength=0;
   1321    uint32_t ch=0;
   1322    const char16_t *pSrcLimit;
   1323    int32_t count;
   1324 
   1325    /* args check */
   1326    if(U_FAILURE(*pErrorCode)){
   1327        return nullptr;
   1328    }
   1329    if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
   1330        (dest==nullptr && destCapacity!=0) || destCapacity<0
   1331    ) {
   1332        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1333        return nullptr;
   1334    }
   1335    uint8_t *pDest = (uint8_t *)dest;
   1336    uint8_t *pDestLimit = pDest + destCapacity;
   1337 
   1338    if(srcLength==-1) {
   1339        /* Convert NUL-terminated ASCII, then find the string length. */
   1340        while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
   1341            *pDest++ = (uint8_t)ch;
   1342            ++src;
   1343        }
   1344        if(ch == 0) {
   1345            reqLength=(int32_t)(pDest - (uint8_t *)dest);
   1346            if(pDestLength) {
   1347                *pDestLength = reqLength;
   1348            }
   1349 
   1350            /* Terminate the buffer */
   1351            u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1352            return dest;
   1353        }
   1354        srcLength = u_strlen(src);
   1355    }
   1356 
   1357    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1358    pSrcLimit = (src!=nullptr)?(src+srcLength):nullptr;
   1359    for(;;) {
   1360        count = (int32_t)(pDestLimit - pDest);
   1361        srcLength = (int32_t)(pSrcLimit - src);
   1362        if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
   1363            /* fast ASCII loop */
   1364            const char16_t *prevSrc = src;
   1365            int32_t delta;
   1366            while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
   1367                *pDest++=(uint8_t)ch;
   1368                ++src;
   1369            }
   1370            delta = (int32_t)(src - prevSrc);
   1371            count -= delta;
   1372            srcLength -= delta;
   1373        }
   1374        /*
   1375         * Each iteration of the inner loop progresses by at most 3 UTF-8
   1376         * bytes and one char16_t.
   1377         */
   1378        count /= 3;
   1379        if(count > srcLength) {
   1380            count = srcLength; /* min(remaining dest/3, remaining src) */
   1381        }
   1382        if(count < 3) {
   1383            /*
   1384             * Too much overhead if we get near the end of the string,
   1385             * continue with the next loop.
   1386             */
   1387            break;
   1388        }
   1389        do {
   1390            ch=*src++;
   1391            if(ch <= 0x7f && ch != 0) {
   1392                *pDest++ = (uint8_t)ch;
   1393            } else if(ch <= 0x7ff) {
   1394                *pDest++=(uint8_t)((ch>>6)|0xc0);
   1395                *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1396            } else {
   1397                *pDest++=(uint8_t)((ch>>12)|0xe0);
   1398                *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1399                *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1400            }
   1401        } while(--count > 0);
   1402    }
   1403 
   1404    while(src<pSrcLimit) {
   1405        ch=*src++;
   1406        if(ch <= 0x7f && ch != 0) {
   1407            if(pDest<pDestLimit) {
   1408                *pDest++ = (uint8_t)ch;
   1409            } else {
   1410                reqLength = 1;
   1411                break;
   1412            }
   1413        } else if(ch <= 0x7ff) {
   1414            if((pDestLimit - pDest) >= 2) {
   1415                *pDest++=(uint8_t)((ch>>6)|0xc0);
   1416                *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1417            } else {
   1418                reqLength = 2;
   1419                break;
   1420            }
   1421        } else {
   1422            if((pDestLimit - pDest) >= 3) {
   1423                *pDest++=(uint8_t)((ch>>12)|0xe0);
   1424                *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1425                *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1426            } else {
   1427                reqLength = 3;
   1428                break;
   1429            }
   1430        }
   1431    }
   1432    while(src<pSrcLimit) {
   1433        ch=*src++;
   1434        if(ch <= 0x7f && ch != 0) {
   1435            ++reqLength;
   1436        } else if(ch<=0x7ff) {
   1437            reqLength+=2;
   1438        } else {
   1439            reqLength+=3;
   1440        }
   1441    }
   1442 
   1443    reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1444    if(pDestLength){
   1445        *pDestLength = reqLength;
   1446    }
   1447 
   1448    /* Terminate the buffer */
   1449    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1450    return dest;
   1451 }