tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucnv_u32.cpp (40746B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*  
      4 **********************************************************************
      5 *   Copyright (C) 2002-2015, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   file name:  ucnv_u32.cpp
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jul01
     14 *   created by: Markus W. Scherer
     15 *
     16 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
     22 
     23 #include "unicode/ucnv.h"
     24 #include "unicode/utf.h"
     25 #include "ucnv_bld.h"
     26 #include "ucnv_cnv.h"
     27 #include "cmemory.h"
     28 
     29 #define MAXIMUM_UCS2            0x0000FFFF  /* largest code point written as a single 16-bit unit */
     30 #define MAXIMUM_UTF             0x0010FFFF  /* largest code point accepted by the toUnicode functions */
     31 #define HALF_SHIFT              10          /* shift applied to the lead-surrogate offset when combining a pair */
     32 #define HALF_BASE               0x0010000   /* folded into SURROGATE_LOW_BASE below */
     33 #define HALF_MASK               0x3FF       /* not referenced in the part of the file reviewed here */
     34 #define SURROGATE_HIGH_START    0xD800      /* subtracted from the lead surrogate when combining a pair */
     35 #define SURROGATE_LOW_START     0xDC00      /* folded into SURROGATE_LOW_BASE below */
     36 
     37 /* -SURROGATE_LOW_START + HALF_BASE, i.e. -0xDC00 + 0x10000; added to the trail surrogate when combining a pair */
     38 #define SURROGATE_LOW_BASE      9216
     39 
     40 enum {
           /* Value of converter->fromUnicodeStatus that makes the fromUnicode functions emit a BOM before any other output. */
     41    UCNV_NEED_TO_WRITE_BOM=1
     42 };
     43 
     44 /* UTF-32BE ----------------------------------------------------------------- */
     45 U_CDECL_BEGIN
     46 static void U_CALLCONV
     47 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
     48                                UErrorCode * err)
     49 {
           /*
            * Converts big-endian UTF-32 bytes from [args->source, args->sourceLimit)
            * into UTF-16 code units written to [args->target, args->targetLimit).
            *
            * Each code point is assembled from four bytes, most significant byte
            * first. Every consumed byte is also copied into converter->toUBytes.
            *
            * If the input ends in the middle of a four-byte sequence, the partial
            * value (plus 1) is saved in converter->toUnicodeStatus and the number of
            * bytes read so far in converter->toULength; the next call resumes from
            * that state.
            *
            * On return *err is
            *   - U_ILLEGAL_CHAR_FOUND if the value is above MAXIMUM_UTF or is a
            *     surrogate (converter->toULength is set to the sequence length),
            *   - U_BUFFER_OVERFLOW_ERROR if the target filled up while input remained,
            *     or if only the lead surrogate of a pair fit (the trail surrogate is
            *     then stored in converter->UCharErrorBuffer),
            *   - otherwise left unchanged.
            * args->source and args->target are advanced past what was consumed/written.
            */
     50    const unsigned char *mySource = (unsigned char *) args->source;
     51    char16_t *myTarget = args->target;
     52    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
     53    const char16_t *targetLimit = args->targetLimit;
     54    unsigned char *toUBytes = args->converter->toUBytes;
     55    uint32_t ch, i;
     56 
     57    /* Restore state of current sequence */
     58    if (args->converter->toULength > 0 && myTarget < targetLimit) {
     59        i = args->converter->toULength;       /* restore # of bytes consumed */
     60        args->converter->toULength = 0;
     61 
     62        ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
     63        args->converter->toUnicodeStatus = 0;
               /* jump into the middle of the main loop to read the remaining bytes */
     64        goto morebytes;
     65    }
     66 
     67    while (mySource < sourceLimit && myTarget < targetLimit) {
     68        i = 0;
     69        ch = 0;
     70 morebytes:
     71        while (i < sizeof(uint32_t)) {
     72            if (mySource < sourceLimit) {
                       /* big-endian: earlier bytes end up in the more significant positions */
     73                ch = (ch << 8) | (uint8_t)(*mySource);
     74                toUBytes[i++] = (char) *(mySource++);
     75            }
     76            else {
     77                /* stores a partially calculated target*/
     78                /* + 1 to make 0 a valid character */
     79                args->converter->toUnicodeStatus = ch + 1;
     80                args->converter->toULength = (int8_t) i;
     81                goto donefornow;
     82            }
     83        }
     84 
     85        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
     86            /* Valid code point: all four bytes were read and the value is in range */
     87            if (ch <= MAXIMUM_UCS2) 
     88            {
     89                /* fits in 16 bits */
     90                *(myTarget++) = (char16_t) ch;
     91            }
     92            else {
     93                /* write out the surrogates */
     94                *(myTarget++) = U16_LEAD(ch);
     95                ch = U16_TRAIL(ch);
     96                if (myTarget < targetLimit) {
     97                    *(myTarget++) = (char16_t)ch;
     98                }
     99                else {
    100                    /* No room for the trail surrogate: park it in the converter's overflow buffer */
    101                    args->converter->UCharErrorBuffer[0] = (char16_t) ch;
    102                    args->converter->UCharErrorBufferLength = 1;
    103                    *err = U_BUFFER_OVERFLOW_ERROR;
    104                    break;
    105                }
    106            }
    107        }
    108        else {
                   /* out of range or a surrogate value: report the bytes held in toUBytes as illegal */
    109            args->converter->toULength = (int8_t)i;
    110            *err = U_ILLEGAL_CHAR_FOUND;
    111            break;
    112        }
    113    }
    114 
    115 donefornow:
    116    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    117        /* End of target buffer */
    118        *err = U_BUFFER_OVERFLOW_ERROR;
    119    }
    120 
           /* hand the advanced positions back to the caller */
    121    args->target = myTarget;
    122    args->source = (const char *) mySource;
    123 }
    124 
    125 static void U_CALLCONV
    126 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
    127                                             UErrorCode * err)
    128 {
           /*
            * Same conversion as T_UConverter_toUnicode_UTF32_BE (big-endian UTF-32
            * bytes to UTF-16 code units), but additionally writes one entry to
            * args->offsets for every UTF-16 code unit written to the target.
            *
            * The offset recorded for a code unit is offsetNum, a running byte count
            * that starts at 0 on each call and grows by i after every converted
            * code point; both halves of a surrogate pair get the same offset.
            *
            * Partial-sequence state (toUnicodeStatus / toULength), error codes and
            * the overflow handling of a trail surrogate are identical to the
            * non-offset variant. args->source, args->target and args->offsets are
            * advanced on return.
            *
            * NOTE(review): when a sequence is resumed from a previous call, i also
            * counts the bytes consumed by that earlier call, so offsetNum advances
            * by 4 even though fewer bytes came from this buffer - confirm this is
            * the intended offset convention.
            */
    129    const unsigned char *mySource = (unsigned char *) args->source;
    130    char16_t *myTarget = args->target;
    131    int32_t *myOffsets = args->offsets;
    132    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    133    const char16_t *targetLimit = args->targetLimit;
    134    unsigned char *toUBytes = args->converter->toUBytes;
    135    uint32_t ch, i;
    136    int32_t offsetNum = 0;
    137 
    138    /* Restore state of current sequence */
    139    if (args->converter->toULength > 0 && myTarget < targetLimit) {
    140        i = args->converter->toULength;       /* restore # of bytes consumed */
    141        args->converter->toULength = 0;
    142 
    143        ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
    144        args->converter->toUnicodeStatus = 0;
               /* jump into the middle of the main loop to read the remaining bytes */
    145        goto morebytes;
    146    }
    147 
    148    while (mySource < sourceLimit && myTarget < targetLimit) {
    149        i = 0;
    150        ch = 0;
    151 morebytes:
    152        while (i < sizeof(uint32_t)) {
    153            if (mySource < sourceLimit) {
                       /* big-endian: earlier bytes end up in the more significant positions */
    154                ch = (ch << 8) | (uint8_t)(*mySource);
    155                toUBytes[i++] = (char) *(mySource++);
    156            }
    157            else {
    158                /* stores a partially calculated target*/
    159                /* + 1 to make 0 a valid character */
    160                args->converter->toUnicodeStatus = ch + 1;
    161                args->converter->toULength = (int8_t) i;
    162                goto donefornow;
    163            }
    164        }
    165 
    166        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
    167            /* Valid code point: all four bytes were read and the value is in range */
    168            if (ch <= MAXIMUM_UCS2) {
    169                /* fits in 16 bits */
    170                *(myTarget++) = (char16_t) ch;
    171                *(myOffsets++) = offsetNum;
    172            }
    173            else {
    174                /* write out the surrogates */
    175                *(myTarget++) = U16_LEAD(ch);
    176                *myOffsets++ = offsetNum;
    177                ch = U16_TRAIL(ch);
    178                if (myTarget < targetLimit)
    179                {
    180                    *(myTarget++) = (char16_t)ch;
    181                    *(myOffsets++) = offsetNum;
    182                }
    183                else {
    184                    /* No room for the trail surrogate: park it in the converter's overflow buffer */
    185                    args->converter->UCharErrorBuffer[0] = (char16_t) ch;
    186                    args->converter->UCharErrorBufferLength = 1;
    187                    *err = U_BUFFER_OVERFLOW_ERROR;
    188                    break;
    189                }
    190            }
    191        }
    192        else {
                   /* out of range or a surrogate value: report the bytes held in toUBytes as illegal */
    193            args->converter->toULength = (int8_t)i;
    194            *err = U_ILLEGAL_CHAR_FOUND;
    195            break;
    196        }
               /* i is 4 here: move the running offset past the sequence just converted */
    197        offsetNum += i;
    198    }
    199 
    200 donefornow:
    201    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    202    {
    203        /* End of target buffer */
    204        *err = U_BUFFER_OVERFLOW_ERROR;
    205    }
    206 
           /* hand the advanced positions back to the caller */
    207    args->target = myTarget;
    208    args->source = (const char *) mySource;
    209    args->offsets = myOffsets;
    210 }
    211 
    212 static void U_CALLCONV
    213 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
    214                                  UErrorCode * err)
    215 {
           /*
            * Converts UTF-16 code units from [args->source, args->sourceLimit) into
            * big-endian UTF-32 bytes written to [args->target, args->targetLimit).
            *
            * - Returns immediately, touching nothing, when there is no input.
            * - If converter->fromUnicodeStatus is UCNV_NEED_TO_WRITE_BOM, the bytes
            *   00 00 FE FF are written first via ucnv_fromUWriteBytes() and the
            *   status is cleared.
            * - A lead surrogate left pending in converter->fromUChar32 by a previous
            *   call is picked up and combined with the first unit of this input.
            * - Unpaired surrogates are stored in converter->fromUChar32 and reported
            *   with U_ILLEGAL_CHAR_FOUND. A lead surrogate at the very end of the
            *   input is stored the same way but only reported when args->flush is set.
            * - Output bytes that do not fit in the target are appended to
            *   converter->charErrorBuffer and *err becomes U_BUFFER_OVERFLOW_ERROR.
            *
            * args->source and args->target are advanced on return.
            */
    216    const char16_t *mySource = args->source;
    217    unsigned char *myTarget;
    218    const char16_t *sourceLimit = args->sourceLimit;
    219    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    220    UChar32 ch, ch2;
    221    unsigned int indexToWrite;
    222    unsigned char temp[sizeof(uint32_t)];
    223 
    224    if(mySource >= sourceLimit) {
    225        /* no input, nothing to do */
    226        return;
    227    }
    228 
    229    /* write the BOM if necessary */
    230    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    231        static const char bom[]={ 0, 0, (char)0xfeu, (char)0xffu };
    232        ucnv_fromUWriteBytes(args->converter,
    233                             bom, 4,
    234                             &args->target, args->targetLimit,
    235                             &args->offsets, -1,
    236                             err);
    237        args->converter->fromUnicodeStatus=0;
    238    }
    239 
           /* read args->target only now: the BOM write above may have advanced it */
    240    myTarget = (unsigned char *) args->target;
           /* the most significant output byte is always 0 (see the 10FFFF comment below) */
    241    temp[0] = 0;
    242 
    243    if (args->converter->fromUChar32) {
               /* a lead surrogate is pending from the previous call: resume at the pairing step */
    244        ch = args->converter->fromUChar32;
    245        args->converter->fromUChar32 = 0;
    246        goto lowsurogate;
    247    }
    248 
    249    while (mySource < sourceLimit && myTarget < targetLimit) {
    250        ch = *(mySource++);
    251 
    252        if (U_IS_SURROGATE(ch)) {
    253            if (U_IS_LEAD(ch)) {
    254 lowsurogate:
    255                if (mySource < sourceLimit) {
    256                    ch2 = *mySource;
    257                    if (U_IS_TRAIL(ch2)) {
                               /* combine lead and trail into one supplementary code point */
    258                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    259                        mySource++;
    260                    }
    261                    else {
    262                        /* this is an unmatched lead code unit (1st surrogate) */
    263                        /* callback(illegal) */
    264                        args->converter->fromUChar32 = ch;
    265                        *err = U_ILLEGAL_CHAR_FOUND;
    266                        break;
    267                    }
    268                }
    269                else {
    270                    /* ran out of source */
    271                    args->converter->fromUChar32 = ch;
    272                    if (args->flush) {
    273                        /* no more input will follow: this is an unmatched lead code unit (1st surrogate) */
    274                        /* callback(illegal) */
    275                        *err = U_ILLEGAL_CHAR_FOUND;
    276                    }
    277                    break;
    278                }
    279            }
    280            else {
    281                /* this is an unmatched trail code unit (2nd surrogate) */
    282                /* callback(illegal) */
    283                args->converter->fromUChar32 = ch;
    284                *err = U_ILLEGAL_CHAR_FOUND;
    285                break;
    286            }
    287        }
    288 
    289        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    290        temp[1] = (uint8_t) (ch >> 16 & 0x1F);
    291        temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    292        temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    293 
               /* emit the four bytes; whatever does not fit goes to the converter's overflow buffer */
    294        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
    295            if (myTarget < targetLimit) {
    296                *(myTarget++) = temp[indexToWrite];
    297            }
    298            else {
    299                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    300                *err = U_BUFFER_OVERFLOW_ERROR;
    301            }
    302        }
    303    }
    304 
           /* target is full but input remains and no other error was reported */
    305    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    306        *err = U_BUFFER_OVERFLOW_ERROR;
    307    }
    308 
    309    args->target = (char *) myTarget;
    310    args->source = mySource;
    311 }
    312 
    313 static void U_CALLCONV
    314 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
    315                                               UErrorCode * err)
    316 {
           /*
            * Same conversion as T_UConverter_fromUnicode_UTF32_BE (UTF-16 code units
            * to big-endian UTF-32 bytes), but additionally writes one entry to
            * args->offsets for every byte written to the target.
            *
            * The offset recorded for the bytes of a code point is offsetNum, a
            * running count of UTF-16 code units that starts at 0 on each call. It
            * advances by 1 per code point, plus 1 more when the code point is
            * supplementary (detected through temp[1] != 0, i.e. bits 16..20 set).
            * Bytes diverted to converter->charErrorBuffer get no offset entry here.
            * The BOM is written with offset -1 by ucnv_fromUWriteBytes().
            *
            * BOM handling, pending-surrogate state, error codes and overflow
            * handling match the non-offset variant. args->source, args->target and
            * args->offsets are advanced on return.
            */
    317    const char16_t *mySource = args->source;
    318    unsigned char *myTarget;
    319    int32_t *myOffsets;
    320    const char16_t *sourceLimit = args->sourceLimit;
    321    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    322    UChar32 ch, ch2;
    323    int32_t offsetNum = 0;
    324    unsigned int indexToWrite;
    325    unsigned char temp[sizeof(uint32_t)];
    326 
    327    if(mySource >= sourceLimit) {
    328        /* no input, nothing to do */
    329        return;
    330    }
    331 
    332    /* write the BOM if necessary */
    333    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    334        static const char bom[]={ 0, 0, (char)0xfeu, (char)0xffu };
    335        ucnv_fromUWriteBytes(args->converter,
    336                             bom, 4,
    337                             &args->target, args->targetLimit,
    338                             &args->offsets, -1,
    339                             err);
    340        args->converter->fromUnicodeStatus=0;
    341    }
    342 
           /* read target/offsets only now: the BOM write above may have advanced them */
    343    myTarget = (unsigned char *) args->target;
    344    myOffsets = args->offsets;
           /* the most significant output byte is always 0 (see the 10FFFF comment below) */
    345    temp[0] = 0;
    346 
    347    if (args->converter->fromUChar32) {
               /* a lead surrogate is pending from the previous call: resume at the pairing step */
    348        ch = args->converter->fromUChar32;
    349        args->converter->fromUChar32 = 0;
    350        goto lowsurogate;
    351    }
    352 
    353    while (mySource < sourceLimit && myTarget < targetLimit) {
    354        ch = *(mySource++);
    355 
    356        if (U_IS_SURROGATE(ch)) {
    357            if (U_IS_LEAD(ch)) {
    358 lowsurogate:
    359                if (mySource < sourceLimit) {
    360                    ch2 = *mySource;
    361                    if (U_IS_TRAIL(ch2)) {
                               /* combine lead and trail into one supplementary code point */
    362                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    363                        mySource++;
    364                    }
    365                    else {
    366                        /* this is an unmatched lead code unit (1st surrogate) */
    367                        /* callback(illegal) */
    368                        args->converter->fromUChar32 = ch;
    369                        *err = U_ILLEGAL_CHAR_FOUND;
    370                        break;
    371                    }
    372                }
    373                else {
    374                    /* ran out of source */
    375                    args->converter->fromUChar32 = ch;
    376                    if (args->flush) {
    377                        /* no more input will follow: this is an unmatched lead code unit (1st surrogate) */
    378                        /* callback(illegal) */
    379                        *err = U_ILLEGAL_CHAR_FOUND;
    380                    }
    381                    break;
    382                }
    383            }
    384            else {
    385                /* this is an unmatched trail code unit (2nd surrogate) */
    386                /* callback(illegal) */
    387                args->converter->fromUChar32 = ch;
    388                *err = U_ILLEGAL_CHAR_FOUND;
    389                break;
    390            }
    391        }
    392 
    393        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    394        temp[1] = (uint8_t) (ch >> 16 & 0x1F);
    395        temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    396        temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    397 
               /* emit the four bytes; whatever does not fit goes to the converter's overflow buffer */
    398        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
    399            if (myTarget < targetLimit) {
    400                *(myTarget++) = temp[indexToWrite];
    401                *(myOffsets++) = offsetNum;
    402            }
    403            else {
    404                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    405                *err = U_BUFFER_OVERFLOW_ERROR;
    406            }
    407        }
               /* one source unit for a BMP code point, two when temp[1] shows a supplementary one */
    408        offsetNum = offsetNum + 1 + (temp[1] != 0);
    409    }
    410 
           /* target is full but input remains and no other error was reported */
    411    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    412        *err = U_BUFFER_OVERFLOW_ERROR;
    413    }
    414 
    415    args->target = (char *) myTarget;
    416    args->source = mySource;
    417    args->offsets = myOffsets;
    418 }
    419 
    420 static UChar32 U_CALLCONV
    421 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
    422                                   UErrorCode* err)
    423 {
           /*
            * Reads one big-endian UTF-32 code point from args->source.
            *
            * Returns the code point when four bytes are available and the value is
            * at most MAXIMUM_UTF and not a surrogate; args->source is advanced by 4.
            *
            * Otherwise returns 0xffff and sets *err:
            *   - U_INDEX_OUTOFBOUNDS_ERROR when there is no input at all,
            *   - U_TRUNCATED_CHAR_FOUND when fewer than four bytes remain (they are
            *     copied to converter->toUBytes and consumed),
            *   - U_ILLEGAL_CHAR_FOUND when the four bytes form an invalid value (they
            *     are copied to converter->toUBytes; args->source is still advanced).
            */
    424    const uint8_t *mySource;
    425    UChar32 myUChar;
    426    int32_t length;
    427 
    428    mySource = (const uint8_t *)args->source;
    429    if (mySource >= (const uint8_t *)args->sourceLimit)
    430    {
    431        /* no input */
    432        *err = U_INDEX_OUTOFBOUNDS_ERROR;
    433        return 0xffff;
    434    }
    435 
    436    length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
    437    if (length < 4) 
    438    {
    439        /* got a partial character */
    440        uprv_memcpy(args->converter->toUBytes, mySource, length);
    441        args->converter->toULength = (int8_t)length;
    442        args->source = (const char *)(mySource + length);
    443        *err = U_TRUNCATED_CHAR_FOUND;
    444        return 0xffff;
    445    }
    446 
    447    /* Don't even try to do a direct cast because the value may be on an odd address. */
    448    myUChar = ((UChar32)mySource[0] << 24)
    449            | ((UChar32)mySource[1] << 16)
    450            | ((UChar32)mySource[2] << 8)
    451            | ((UChar32)mySource[3]);
    452 
           /* the four bytes are consumed whether or not the value turns out to be valid */
    453    args->source = (const char *)(mySource + 4);
           /* compare as unsigned so that values with the top bit set are rejected too */
    454    if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
    455        return myUChar;
    456    }
    457 
           /* invalid value: keep the offending bytes in the converter for error reporting */
    458    uprv_memcpy(args->converter->toUBytes, mySource, 4);
    459    args->converter->toULength = 4;
    460 
    461    *err = U_ILLEGAL_CHAR_FOUND;
    462    return 0xffff;
    463 }
    464 U_CDECL_END
    465 static const UConverterImpl _UTF32BEImpl = {
           /*
            * Implementation table for the UTF-32BE converter. It wires up the five
            * UTF-32BE functions defined above; the remaining slots are nullptr.
            * NOTE(review): the meaning of each positional slot comes from the
            * UConverterImpl declaration, which is not part of this file - confirm
            * against that declaration before reordering or filling entries.
            */
    466    UCNV_UTF32_BigEndian,
    467 
    468    nullptr,
    469    nullptr,
    470 
    471    nullptr,
    472    nullptr,
    473    nullptr,
    474 
    475    T_UConverter_toUnicode_UTF32_BE,
    476    T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
    477    T_UConverter_fromUnicode_UTF32_BE,
    478    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
    479    T_UConverter_getNextUChar_UTF32_BE,
    480 
    481    nullptr,
    482    nullptr,
    483    nullptr,
    484    nullptr,
    485    ucnv_getNonSurrogateUnicodeSet,
    486 
    487    nullptr,
    488    nullptr
    489 };
    490 
    491 /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
        /*
         * Static description of the UTF-32BE converter: its name, the CCSID
         * mentioned above, and the converter type UCNV_UTF32_BigEndian.
         * NOTE(review): the "4, 4" pair presumably gives the minimum and maximum
         * bytes per character, and { 0, 0, 0xff, 0xfd } with length 4 presumably is
         * the substitution character U+FFFD in big-endian form - confirm against
         * the UConverterStaticData declaration, which is not part of this file.
         */
    492 static const UConverterStaticData _UTF32BEStaticData = {
    493    sizeof(UConverterStaticData),
    494    "UTF-32BE",
    495    1232,
    496    UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
    497    { 0, 0, 0xff, 0xfd }, 4, false, false,
    498    0,
    499    0,
    500    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    501 };
    502 
    503 const UConverterSharedData _UTF32BEData =
           /* non-static shared-data object pairing the UTF-32BE static data with its implementation table */
    504        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
    505 
    506 /* UTF-32LE ---------------------------------------------------------- */
    507 U_CDECL_BEGIN
    508 static void U_CALLCONV
    509 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
    510                                UErrorCode * err)
    511 {
           /*
            * Converts little-endian UTF-32 bytes from [args->source, args->sourceLimit)
            * into UTF-16 code units written to [args->target, args->targetLimit).
            *
            * Each code point is assembled from four bytes, least significant byte
            * first. Every consumed byte is also copied into converter->toUBytes.
            *
            * If the input ends in the middle of a four-byte sequence, the partial
            * value (plus 1) is saved in converter->toUnicodeStatus and the number of
            * bytes read so far in converter->toULength; the next call resumes from
            * that state.
            *
            * On return *err is
            *   - U_ILLEGAL_CHAR_FOUND if the value is above MAXIMUM_UTF or is a
            *     surrogate (converter->toULength is set to the sequence length),
            *   - U_BUFFER_OVERFLOW_ERROR if the target filled up while input remained,
            *     or if only the lead surrogate of a pair fit (the trail surrogate is
            *     then stored in converter->UCharErrorBuffer),
            *   - otherwise left unchanged.
            * args->source and args->target are advanced past what was consumed/written.
            */
    512    const unsigned char *mySource = (unsigned char *) args->source;
    513    char16_t *myTarget = args->target;
    514    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    515    const char16_t *targetLimit = args->targetLimit;
    516    unsigned char *toUBytes = args->converter->toUBytes;
    517    uint32_t ch, i;
    518 
    519    /* Restore state of current sequence */
    520    if (args->converter->toULength > 0 && myTarget < targetLimit)
    521    {
    522        i = args->converter->toULength;       /* restore # of bytes consumed */
    523        args->converter->toULength = 0;
    524 
    525        /* Stores the previously calculated ch from a previous call*/
    526        ch = args->converter->toUnicodeStatus - 1;
    527        args->converter->toUnicodeStatus = 0;
               /* jump into the middle of the main loop to read the remaining bytes */
    528        goto morebytes;
    529    }
    530 
    531    while (mySource < sourceLimit && myTarget < targetLimit)
    532    {
    533        i = 0;
    534        ch = 0;
    535 morebytes:
    536        while (i < sizeof(uint32_t))
    537        {
    538            if (mySource < sourceLimit)
    539            {
                       /* little-endian: byte number i lands at bit position i * 8 */
    540                ch |= ((uint8_t)(*mySource)) << (i * 8);
    541                toUBytes[i++] = (char) *(mySource++);
    542            }
    543            else
    544            {
    545                /* stores a partially calculated target*/
    546                /* + 1 to make 0 a valid character */
    547                args->converter->toUnicodeStatus = ch + 1;
    548                args->converter->toULength = (int8_t) i;
    549                goto donefornow;
    550            }
    551        }
    552 
    553        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
    554            /* Valid code point: all four bytes were read and the value is in range */
    555            if (ch <= MAXIMUM_UCS2) {
    556                /* fits in 16 bits */
    557                *(myTarget++) = (char16_t) ch;
    558            }
    559            else {
    560                /* write out the surrogates */
    561                *(myTarget++) = U16_LEAD(ch);
    562                ch = U16_TRAIL(ch);
    563                if (myTarget < targetLimit) {
    564                    *(myTarget++) = (char16_t)ch;
    565                }
    566                else {
    567                    /* No room for the trail surrogate: park it in the converter's overflow buffer */
    568                    args->converter->UCharErrorBuffer[0] = (char16_t) ch;
    569                    args->converter->UCharErrorBufferLength = 1;
    570                    *err = U_BUFFER_OVERFLOW_ERROR;
    571                    break;
    572                }
    573            }
    574        }
    575        else {
                   /* out of range or a surrogate value: report the bytes held in toUBytes as illegal */
    576            args->converter->toULength = (int8_t)i;
    577            *err = U_ILLEGAL_CHAR_FOUND;
    578            break;
    579        }
    580    }
    581 
    582 donefornow:
    583    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    584    {
    585        /* End of target buffer */
    586        *err = U_BUFFER_OVERFLOW_ERROR;
    587    }
    588 
           /* hand the advanced positions back to the caller */
    589    args->target = myTarget;
    590    args->source = (const char *) mySource;
    591 }
    592 
    593 static void U_CALLCONV
    594 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
    595                                             UErrorCode * err)
    596 {
           /*
            * Same conversion as T_UConverter_toUnicode_UTF32_LE (little-endian UTF-32
            * bytes to UTF-16 code units), but additionally writes one entry to
            * args->offsets for every UTF-16 code unit written to the target.
            *
            * The offset recorded for a code unit is offsetNum, a running byte count
            * that starts at 0 on each call and grows by i after every converted
            * code point; both halves of a surrogate pair get the same offset.
            *
            * Partial-sequence state (toUnicodeStatus / toULength), error codes and
            * the overflow handling of a trail surrogate are identical to the
            * non-offset variant. args->source, args->target and args->offsets are
            * advanced on return.
            *
            * NOTE(review): when a sequence is resumed from a previous call, i also
            * counts the bytes consumed by that earlier call, so offsetNum advances
            * by 4 even though fewer bytes came from this buffer - confirm this is
            * the intended offset convention.
            */
    597    const unsigned char *mySource = (unsigned char *) args->source;
    598    char16_t *myTarget = args->target;
    599    int32_t *myOffsets = args->offsets;
    600    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    601    const char16_t *targetLimit = args->targetLimit;
    602    unsigned char *toUBytes = args->converter->toUBytes;
    603    uint32_t ch, i;
    604    int32_t offsetNum = 0;
    605 
    606    /* Restore state of current sequence */
    607    if (args->converter->toULength > 0 && myTarget < targetLimit)
    608    {
    609        i = args->converter->toULength;       /* restore # of bytes consumed */
    610        args->converter->toULength = 0;
    611 
    612        /* Stores the previously calculated ch from a previous call*/
    613        ch = args->converter->toUnicodeStatus - 1;
    614        args->converter->toUnicodeStatus = 0;
               /* jump into the middle of the main loop to read the remaining bytes */
    615        goto morebytes;
    616    }
    617 
    618    while (mySource < sourceLimit && myTarget < targetLimit)
    619    {
    620        i = 0;
    621        ch = 0;
    622 morebytes:
    623        while (i < sizeof(uint32_t))
    624        {
    625            if (mySource < sourceLimit)
    626            {
                       /* little-endian: byte number i lands at bit position i * 8 */
    627                ch |= ((uint8_t)(*mySource)) << (i * 8);
    628                toUBytes[i++] = (char) *(mySource++);
    629            }
    630            else
    631            {
    632                /* stores a partially calculated target*/
    633                /* + 1 to make 0 a valid character */
    634                args->converter->toUnicodeStatus = ch + 1;
    635                args->converter->toULength = (int8_t) i;
    636                goto donefornow;
    637            }
    638        }
    639 
    640        if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
    641        {
    642            /* Valid code point: all four bytes were read and the value is in range */
    643            if (ch <= MAXIMUM_UCS2) 
    644            {
    645                /* fits in 16 bits */
    646                *(myTarget++) = (char16_t) ch;
    647                *(myOffsets++) = offsetNum;
    648            }
    649            else {
    650                /* write out the surrogates */
    651                *(myTarget++) = U16_LEAD(ch);
    652                *(myOffsets++) = offsetNum;
    653                ch = U16_TRAIL(ch);
    654                if (myTarget < targetLimit)
    655                {
    656                    *(myTarget++) = (char16_t)ch;
    657                    *(myOffsets++) = offsetNum;
    658                }
    659                else
    660                {
    661                    /* No room for the trail surrogate: park it in the converter's overflow buffer */
    662                    args->converter->UCharErrorBuffer[0] = (char16_t) ch;
    663                    args->converter->UCharErrorBufferLength = 1;
    664                    *err = U_BUFFER_OVERFLOW_ERROR;
    665                    break;
    666                }
    667            }
    668        }
    669        else
    670        {
                   /* out of range or a surrogate value: report the bytes held in toUBytes as illegal */
    671            args->converter->toULength = (int8_t)i;
    672            *err = U_ILLEGAL_CHAR_FOUND;
    673            break;
    674        }
               /* i is 4 here: move the running offset past the sequence just converted */
    675        offsetNum += i;
    676    }
    677 
    678 donefornow:
    679    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    680    {
    681        /* End of target buffer */
    682        *err = U_BUFFER_OVERFLOW_ERROR;
    683    }
    684 
           /* hand the advanced positions back to the caller */
    685    args->target = myTarget;
    686    args->source = (const char *) mySource;
    687    args->offsets = myOffsets;
    688 }
    689 
    690 static void U_CALLCONV
    691 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
    692                                  UErrorCode * err)
    693 {
           /*
            * Converts UTF-16 code units from [args->source, args->sourceLimit) into
            * little-endian UTF-32 bytes written to [args->target, args->targetLimit).
            *
            * - Returns immediately, touching nothing, when there is no input.
            * - If converter->fromUnicodeStatus is UCNV_NEED_TO_WRITE_BOM, the bytes
            *   FF FE 00 00 are written first via ucnv_fromUWriteBytes() and the
            *   status is cleared.
            * - A lead surrogate left pending in converter->fromUChar32 by a previous
            *   call is picked up and combined with the first unit of this input.
            * - Unpaired surrogates are stored in converter->fromUChar32 and reported
            *   with U_ILLEGAL_CHAR_FOUND. A lead surrogate at the very end of the
            *   input is stored the same way but only reported when args->flush is set.
            * - Output bytes that do not fit in the target are appended to
            *   converter->charErrorBuffer and *err becomes U_BUFFER_OVERFLOW_ERROR.
            *
            * args->source and args->target are advanced on return.
            */
    694    const char16_t *mySource = args->source;
    695    unsigned char *myTarget;
    696    const char16_t *sourceLimit = args->sourceLimit;
    697    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    698    UChar32 ch, ch2;
    699    unsigned int indexToWrite;
    700    unsigned char temp[sizeof(uint32_t)];
    701 
    702    if(mySource >= sourceLimit) {
    703        /* no input, nothing to do */
    704        return;
    705    }
    706 
    707    /* write the BOM if necessary */
    708    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    709        static const char bom[]={ (char)0xffu, (char)0xfeu, 0, 0 };
    710        ucnv_fromUWriteBytes(args->converter,
    711                             bom, 4,
    712                             &args->target, args->targetLimit,
    713                             &args->offsets, -1,
    714                             err);
    715        args->converter->fromUnicodeStatus=0;
    716    }
    717 
           /* read args->target only now: the BOM write above may have advanced it */
    718    myTarget = (unsigned char *) args->target;
           /* the most significant byte (written last in little-endian order) is always 0 */
    719    temp[3] = 0;
    720 
    721    if (args->converter->fromUChar32)
    722    {
               /* a lead surrogate is pending from the previous call: resume at the pairing step */
    723        ch = args->converter->fromUChar32;
    724        args->converter->fromUChar32 = 0;
    725        goto lowsurogate;
    726    }
    727 
    728    while (mySource < sourceLimit && myTarget < targetLimit)
    729    {
    730        ch = *(mySource++);
    731 
    732        if (U16_IS_SURROGATE(ch)) {
    733            if (U16_IS_LEAD(ch))
    734            {
    735 lowsurogate:
    736                if (mySource < sourceLimit)
    737                {
    738                    ch2 = *mySource;
    739                    if (U16_IS_TRAIL(ch2)) {
                               /* combine lead and trail into one supplementary code point */
    740                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    741                        mySource++;
    742                    }
    743                    else {
    744                        /* this is an unmatched lead code unit (1st surrogate) */
    745                        /* callback(illegal) */
    746                        args->converter->fromUChar32 = ch;
    747                        *err = U_ILLEGAL_CHAR_FOUND;
    748                        break;
    749                    }
    750                }
    751                else {
    752                    /* ran out of source */
    753                    args->converter->fromUChar32 = ch;
    754                    if (args->flush) {
    755                        /* no more input will follow: this is an unmatched lead code unit (1st surrogate) */
    756                        /* callback(illegal) */
    757                        *err = U_ILLEGAL_CHAR_FOUND;
    758                    }
    759                    break;
    760                }
    761            }
    762            else {
    763                /* this is an unmatched trail code unit (2nd surrogate) */
    764                /* callback(illegal) */
    765                args->converter->fromUChar32 = ch;
    766                *err = U_ILLEGAL_CHAR_FOUND;
    767                break;
    768            }
    769        }
    770 
    771        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    772        temp[2] = (uint8_t) (ch >> 16 & 0x1F);
    773        temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    774        temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    775 
               /* emit the four bytes; whatever does not fit goes to the converter's overflow buffer */
    776        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
    777        {
    778            if (myTarget < targetLimit)
    779            {
    780                *(myTarget++) = temp[indexToWrite];
    781            }
    782            else
    783            {
    784                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    785                *err = U_BUFFER_OVERFLOW_ERROR;
    786            }
    787        }
    788    }
    789 
           /* target is full but input remains and no other error was reported */
    790    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    791    {
    792        *err = U_BUFFER_OVERFLOW_ERROR;
    793    }
    794 
    795    args->target = (char *) myTarget;
    796    args->source = mySource;
    797 }
    798 
    799 static void U_CALLCONV
    800 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
    801                                               UErrorCode * err)
    802 {
    803    const char16_t *mySource = args->source;
    804    unsigned char *myTarget;
    805    int32_t *myOffsets;
    806    const char16_t *sourceLimit = args->sourceLimit;
    807    const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    808    UChar32 ch, ch2;
    809    unsigned int indexToWrite;
    810    unsigned char temp[sizeof(uint32_t)];
    811    int32_t offsetNum = 0;
    812 
    813    if(mySource >= sourceLimit) {
    814        /* no input, nothing to do */
    815        return;
    816    }
    817 
    818    /* write the BOM if necessary */
    819    if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    820        static const char bom[]={ (char)0xffu, (char)0xfeu, 0, 0 };
    821        ucnv_fromUWriteBytes(args->converter,
    822                             bom, 4,
    823                             &args->target, args->targetLimit,
    824                             &args->offsets, -1,
    825                             err);
    826        args->converter->fromUnicodeStatus=0;
    827    }
    828 
    829    myTarget = (unsigned char *) args->target;
    830    myOffsets = args->offsets;
    831    temp[3] = 0;
    832 
    833    if (args->converter->fromUChar32)
    834    {
    835        ch = args->converter->fromUChar32;
    836        args->converter->fromUChar32 = 0;
    837        goto lowsurogate;
    838    }
    839 
    840    while (mySource < sourceLimit && myTarget < targetLimit)
    841    {
    842        ch = *(mySource++);
    843 
    844        if (U16_IS_SURROGATE(ch)) {
    845            if (U16_IS_LEAD(ch))
    846            {
    847 lowsurogate:
    848                if (mySource < sourceLimit)
    849                {
    850                    ch2 = *mySource;
    851                    if (U16_IS_TRAIL(ch2))
    852                    {
    853                        ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    854                        mySource++;
    855                    }
    856                    else {
    857                        /* this is an unmatched trail code unit (2nd surrogate) */
    858                        /* callback(illegal) */
    859                        args->converter->fromUChar32 = ch;
    860                        *err = U_ILLEGAL_CHAR_FOUND;
    861                        break;
    862                    }
    863                }
    864                else {
    865                    /* ran out of source */
    866                    args->converter->fromUChar32 = ch;
    867                    if (args->flush) {
    868                        /* this is an unmatched trail code unit (2nd surrogate) */
    869                        /* callback(illegal) */
    870                        *err = U_ILLEGAL_CHAR_FOUND;
    871                    }
    872                    break;
    873                }
    874            }
    875            else {
    876                /* this is an unmatched trail code unit (2nd surrogate) */
    877                /* callback(illegal) */
    878                args->converter->fromUChar32 = ch;
    879                *err = U_ILLEGAL_CHAR_FOUND;
    880                break;
    881            }
    882        }
    883 
    884        /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    885        temp[2] = (uint8_t) (ch >> 16 & 0x1F);
    886        temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    887        temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    888 
    889        for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
    890        {
    891            if (myTarget < targetLimit)
    892            {
    893                *(myTarget++) = temp[indexToWrite];
    894                *(myOffsets++) = offsetNum;
    895            }
    896            else
    897            {
    898                args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    899                *err = U_BUFFER_OVERFLOW_ERROR;
    900            }
    901        }
    902        offsetNum = offsetNum + 1 + (temp[2] != 0);
    903    }
    904 
    905    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    906    {
    907        *err = U_BUFFER_OVERFLOW_ERROR;
    908    }
    909 
    910    args->target = (char *) myTarget;
    911    args->source = mySource;
    912    args->offsets = myOffsets;
    913 }
    914 
    915 static UChar32 U_CALLCONV
    916 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
    917                                   UErrorCode* err)
    918 {
    919    const uint8_t *mySource;
    920    UChar32 myUChar;
    921    int32_t length;
    922 
    923    mySource = (const uint8_t *)args->source;
    924    if (mySource >= (const uint8_t *)args->sourceLimit)
    925    {
    926        /* no input */
    927        *err = U_INDEX_OUTOFBOUNDS_ERROR;
    928        return 0xffff;
    929    }
    930 
    931    length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
    932    if (length < 4) 
    933    {
    934        /* got a partial character */
    935        uprv_memcpy(args->converter->toUBytes, mySource, length);
    936        args->converter->toULength = (int8_t)length;
    937        args->source = (const char *)(mySource + length);
    938        *err = U_TRUNCATED_CHAR_FOUND;
    939        return 0xffff;
    940    }
    941 
    942    /* Don't even try to do a direct cast because the value may be on an odd address. */
    943    myUChar = ((UChar32)mySource[3] << 24)
    944            | ((UChar32)mySource[2] << 16)
    945            | ((UChar32)mySource[1] << 8)
    946            | ((UChar32)mySource[0]);
    947 
    948    args->source = (const char *)(mySource + 4);
    949    if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
    950        return myUChar;
    951    }
    952 
    953    uprv_memcpy(args->converter->toUBytes, mySource, 4);
    954    args->converter->toULength = 4;
    955 
    956    *err = U_ILLEGAL_CHAR_FOUND;
    957    return 0xffff;
    958 }
    959 U_CDECL_END
    960 static const UConverterImpl _UTF32LEImpl = {
    961    UCNV_UTF32_LittleEndian,
    962 
    963    nullptr,
    964    nullptr,
    965 
    966    nullptr,
    967    nullptr,
    968    nullptr,
    969 
    970    T_UConverter_toUnicode_UTF32_LE,
    971    T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
    972    T_UConverter_fromUnicode_UTF32_LE,
    973    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
    974    T_UConverter_getNextUChar_UTF32_LE,
    975 
    976    nullptr,
    977    nullptr,
    978    nullptr,
    979    nullptr,
    980    ucnv_getNonSurrogateUnicodeSet,
    981 
    982    nullptr,
    983    nullptr
    984 };
    985 
/* The 1234 CCSID refers to any version of Unicode in little-endian UTF-32 */
    987 static const UConverterStaticData _UTF32LEStaticData = {
    988    sizeof(UConverterStaticData),
    989    "UTF-32LE",
    990    1234,
    991    UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
    992    { 0xfd, 0xff, 0, 0 }, 4, false, false,
    993    0,
    994    0,
    995    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    996 };
    997 
    998 
    999 const UConverterSharedData _UTF32LEData =
   1000        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
   1001 
   1002 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
   1003 
   1004 /*
   1005 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
   1006 * accordingly.
   1007 *
   1008 * State values:
   1009 * 0    initial state
   1010 * 1    saw 00
   1011 * 2    saw 00 00
   1012 * 3    saw 00 00 FE
   1013 * 4    -
   1014 * 5    saw FF
   1015 * 6    saw FF FE
   1016 * 7    saw FF FE 00
   1017 * 8    UTF-32BE mode
   1018 * 9    UTF-32LE mode
   1019 *
   1020 * During detection: state&3==number of matching bytes so far.
   1021 *
   1022 * On output, emit U+FEFF as the first code point.
   1023 */
   1024 U_CDECL_BEGIN
   1025 static void U_CALLCONV
   1026 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
   1027    if(choice<=UCNV_RESET_TO_UNICODE) {
   1028        /* reset toUnicode: state=0 */
   1029        cnv->mode=0;
   1030    }
   1031    if(choice!=UCNV_RESET_TO_UNICODE) {
   1032        /* reset fromUnicode: prepare to output the UTF-32PE BOM */
   1033        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   1034    }
   1035 }
   1036 
   1037 static void U_CALLCONV
   1038 _UTF32Open(UConverter *cnv,
   1039           UConverterLoadArgs *pArgs,
   1040           UErrorCode *pErrorCode) {
   1041    (void)pArgs;
   1042    (void)pErrorCode;
   1043    _UTF32Reset(cnv, UCNV_RESET_BOTH);
   1044 }
   1045 
   1046 static const char utf32BOM[8]={ 0, 0, (char)0xfeu, (char)0xffu, (char)0xffu, (char)0xfeu, 0, 0 };
   1047 
   1048 static void U_CALLCONV
   1049 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   1050                           UErrorCode *pErrorCode) {
   1051    UConverter *cnv=pArgs->converter;
   1052    const char *source=pArgs->source;
   1053    const char *sourceLimit=pArgs->sourceLimit;
   1054    int32_t *offsets=pArgs->offsets;
   1055 
   1056    int32_t state, offsetDelta;
   1057    char b;
   1058 
   1059    state=cnv->mode;
   1060 
   1061    /*
   1062     * If we detect a BOM in this buffer, then we must add the BOM size to the
   1063     * offsets because the actual converter function will not see and count the BOM.
   1064     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
   1065     */
   1066    offsetDelta=0;
   1067 
   1068    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
   1069        switch(state) {
   1070        case 0:
   1071            b=*source;
   1072            if(b==0) {
   1073                state=1; /* could be 00 00 FE FF */
   1074            } else if(b==(char)0xffu) {
   1075                state=5; /* could be FF FE 00 00 */
   1076            } else {
   1077                state=8; /* default to UTF-32BE */
   1078                continue;
   1079            }
   1080            ++source;
   1081            break;
   1082        case 1:
   1083        case 2:
   1084        case 3:
   1085        case 5:
   1086        case 6:
   1087        case 7:
   1088            if(*source==utf32BOM[state]) {
   1089                ++state;
   1090                ++source;
   1091                if(state==4) {
   1092                    state=8; /* detect UTF-32BE */
   1093                    offsetDelta=(int32_t)(source-pArgs->source);
   1094                } else if(state==8) {
   1095                    state=9; /* detect UTF-32LE */
   1096                    offsetDelta=(int32_t)(source-pArgs->source);
   1097                }
   1098            } else {
   1099                /* switch to UTF-32BE and pass the previous bytes */
   1100                int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
   1101 
   1102                /* reset the source */
   1103                source=pArgs->source;
   1104 
   1105                if(count==(state&3)) {
   1106                    /* simple: all in the same buffer, just reset source */
   1107                } else {
   1108                    UBool oldFlush=pArgs->flush;
   1109 
   1110                    /* some of the bytes are from a previous buffer, replay those first */
   1111                    pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
   1112                    pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
   1113                    pArgs->flush=false; /* this sourceLimit is not the real source stream limit */
   1114 
   1115                    /* no offsets: bytes from previous buffer, and not enough for output */
   1116                    T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1117 
   1118                    /* restore real pointers; pArgs->source will be set in case 8/9 */
   1119                    pArgs->sourceLimit=sourceLimit;
   1120                    pArgs->flush=oldFlush;
   1121                }
   1122                state=8;
   1123                continue;
   1124            }
   1125            break;
   1126        case 8:
   1127            /* call UTF-32BE */
   1128            pArgs->source=source;
   1129            if(offsets==nullptr) {
   1130                T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1131            } else {
   1132                T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
   1133            }
   1134            source=pArgs->source;
   1135            break;
   1136        case 9:
   1137            /* call UTF-32LE */
   1138            pArgs->source=source;
   1139            if(offsets==nullptr) {
   1140                T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
   1141            } else {
   1142                T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
   1143            }
   1144            source=pArgs->source;
   1145            break;
   1146        default:
   1147            break; /* does not occur */
   1148        }
   1149    }
   1150 
   1151    /* add BOM size to offsets - see comment at offsetDelta declaration */
   1152    if(offsets!=nullptr && offsetDelta!=0) {
   1153        int32_t *offsetsLimit=pArgs->offsets;
   1154        while(offsets<offsetsLimit) {
   1155            *offsets++ += offsetDelta;
   1156        }
   1157    }
   1158 
   1159    pArgs->source=source;
   1160 
   1161    if(source==sourceLimit && pArgs->flush) {
   1162        /* handle truncated input */
   1163        switch(state) {
   1164        case 0:
   1165            break; /* no input at all, nothing to do */
   1166        case 8:
   1167            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1168            break;
   1169        case 9:
   1170            T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
   1171            break;
   1172        default:
   1173            /* handle 0<state<8: call UTF-32BE with too-short input */
   1174            pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
   1175            pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
   1176 
   1177            /* no offsets: not enough for output */
   1178            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1179            pArgs->source=source;
   1180            pArgs->sourceLimit=sourceLimit;
   1181            state=8;
   1182            break;
   1183        }
   1184    }
   1185 
   1186    cnv->mode=state;
   1187 }
   1188 
   1189 static UChar32 U_CALLCONV
   1190 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
   1191                   UErrorCode *pErrorCode) {
   1192    switch(pArgs->converter->mode) {
   1193    case 8:
   1194        return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
   1195    case 9:
   1196        return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
   1197    default:
   1198        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   1199    }
   1200 }
   1201 U_CDECL_END
   1202 static const UConverterImpl _UTF32Impl = {
   1203    UCNV_UTF32,
   1204 
   1205    nullptr,
   1206    nullptr,
   1207 
   1208    _UTF32Open,
   1209    nullptr,
   1210    _UTF32Reset,
   1211 
   1212    _UTF32ToUnicodeWithOffsets,
   1213    _UTF32ToUnicodeWithOffsets,
   1214 #if U_IS_BIG_ENDIAN
   1215    T_UConverter_fromUnicode_UTF32_BE,
   1216    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
   1217 #else
   1218    T_UConverter_fromUnicode_UTF32_LE,
   1219    T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
   1220 #endif
   1221    _UTF32GetNextUChar,
   1222 
   1223    nullptr, /* ### TODO implement getStarters for all Unicode encodings?! */
   1224    nullptr,
   1225    nullptr,
   1226    nullptr,
   1227    ucnv_getNonSurrogateUnicodeSet,
   1228 
   1229    nullptr,
   1230    nullptr
   1231 };
   1232 
   1233 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianness of UTF-32 */
   1234 static const UConverterStaticData _UTF32StaticData = {
   1235    sizeof(UConverterStaticData),
   1236    "UTF-32",
   1237    1236,
   1238    UCNV_IBM, UCNV_UTF32, 4, 4,
   1239 #if U_IS_BIG_ENDIAN
   1240    { 0, 0, 0xff, 0xfd }, 4,
   1241 #else
   1242    { 0xfd, 0xff, 0, 0 }, 4,
   1243 #endif
   1244    false, false,
   1245    0,
   1246    0,
   1247    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1248 };
   1249 
   1250 const UConverterSharedData _UTF32Data = 
   1251        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
   1252 
   1253 #endif