[ tor-browser ].git.dasho

ucnv_u8.cpp (31015B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*  
      4 **********************************************************************
      5 *   Copyright (C) 2002-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   file name:  ucnv_u8.cpp
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jul01
     14 *   created by: Markus W. Scherer
     15 *
     16 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
     17 *
     18 *   Also, CESU-8 implementation, see UTR 26.
     19 *   The CESU-8 converter uses all the same functions as the
     20 *   UTF-8 converter, with a branch for converting supplementary code points.
     21 */
     22 
     23 #include "unicode/utypes.h"
     24 
     25 #if !UCONFIG_NO_CONVERSION
     26 
     27 #include "unicode/ucnv.h"
     28 #include "unicode/utf.h"
     29 #include "unicode/utf8.h"
     30 #include "unicode/utf16.h"
     31 #include "uassert.h"
     32 #include "ucnv_bld.h"
     33 #include "ucnv_cnv.h"
     34 #include "cmemory.h"
     35 #include "ustr_imp.h"
     36 
     37 /* Prototypes --------------------------------------------------------------- */
     38 
     39 /* Forward declarations of the two fromUnicode entry points defined further down in this file. Keep these here to make finicky compilers happy */
     40 
     41 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
     42                                           UErrorCode *err);
     43 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
     44                                                        UErrorCode *err);
     45 
     46 
     47 /* UTF-8 -------------------------------------------------------------------- */
     48 
     /*
      * Largest value that the converters in this file emit as a single UTF-16
      * code unit (to-Unicode direction) or as a 3-byte sequence (from-Unicode
      * direction); anything above it is written as a surrogate pair or as a
      * 4-byte sequence respectively.
      */
     49 #define MAXIMUM_UCS2            0x0000FFFF
     50 
     /*
      * Indexed by the total number of bytes in a UTF-8 sequence.
      * The to-Unicode code accumulates ch = (ch << 6) + byte without masking the
      * lead/trail marker bits; subtracting the entry for the sequence length
      * removes those bits again. For example, the 2-byte entry is
      * (0xC0 << 6) + 0x80 = 0x3080.
      * Index 0 appears to be an unused placeholder - TODO confirm.
      */
     51 static const uint32_t offsetsFromUTF8[5] = {0,
     52  static_cast<uint32_t>(0x00000000), static_cast<uint32_t>(0x00003080),
     53  static_cast<uint32_t>(0x000E2080), static_cast<uint32_t>(0x03C82080)
     54 };
     55 
     /*
      * Tells whether cnv is the CESU-8 flavour of this converter, i.e. whether its
      * shared data is the _CESU8Data instance. Builds configured with
      * UCONFIG_ONLY_HTML_CONVERSION always answer false.
      */
     56 static UBool hasCESU8Data(const UConverter *cnv)
     57 {
     58 #if !UCONFIG_ONLY_HTML_CONVERSION
     59    return cnv->sharedData == &_CESU8Data;
     60 #else
     61    return false;
     62 #endif
     63 }
     64 U_CDECL_BEGIN
     /*
      * Converts UTF-8 bytes from args->source into UTF-16 code units at
      * args->target without writing offsets. The same code handles CESU-8 when
      * hasCESU8Data() is true for the converter.
      *
      * State kept in the converter for a sequence that is cut off at sourceLimit:
      *   cnv->toULength        number of bytes of the sequence consumed so far
      *   cnv->mode             total number of bytes the sequence needs
      *   cnv->toUnicodeStatus  value accumulated from the bytes consumed so far
      *   cnv->toUBytes         the bytes themselves
      *
      * Results reported through *err:
      *   U_ILLEGAL_CHAR_FOUND     malformed sequence; cnv->toULength is set to the
      *                            number of bytes that were consumed for it
      *   U_BUFFER_OVERFLOW_ERROR  the target is full while input remains, or the
      *                            trail surrogate of a pair did not fit (it is
      *                            then stored in cnv->UCharErrorBuffer)
      *
      * On every exit path args->target and args->source are updated to the
      * positions reached.
      */
     65 static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
     66                                  UErrorCode * err)
     67 {
     68    UConverter *cnv = args->converter;
     69    const unsigned char *mySource = (unsigned char *) args->source;
     70    char16_t *myTarget = args->target;
     71    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
     72    const char16_t *targetLimit = args->targetLimit;
     73    unsigned char *toUBytes = cnv->toUBytes;
     74    UBool isCESU8 = hasCESU8Data(cnv);
     75    uint32_t ch, ch2 = 0;
     76    int32_t i, inBytes;
     77 
     78    /* Restore size of current sequence */
           /* Only resumed when there is room for output; jumps into the middle of the loop body below. */
     79    if (cnv->toULength > 0 && myTarget < targetLimit)
     80    {
     81        inBytes = cnv->mode;            /* restore # of bytes to consume */
     82        i = cnv->toULength;             /* restore # of bytes consumed */
     83        cnv->toULength = 0;
     84 
     85        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
     86        cnv->toUnicodeStatus = 0;
     87        goto morebytes;
     88    }
     89 
     90 
     91    while (mySource < sourceLimit && myTarget < targetLimit)
     92    {
     93        ch = *(mySource++);
     94        if (U8_IS_SINGLE(ch))        /* Simple case */
     95        {
     96            *(myTarget++) = (char16_t) ch;
     97        }
     98        else
     99        {
    100            /* store the first char */
    101            toUBytes[0] = (char)ch;
    102            inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
    103            i = 1;
    104 
    105 morebytes:
    106            while (i < inBytes)
    107            {
    108                if (mySource < sourceLimit)
    109                {
                           /* The byte is recorded before it is validated; it only counts as
                              consumed once mySource and i are advanced below. */
    110                    toUBytes[i] = (char) (ch2 = *mySource);
                           /* Stop on a byte that isValidTrail() rejects. CESU-8 exception:
                              directly after a 0xED lead byte any trail byte is let through. */
    111                    if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
    112                            !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
    113                    {
    114                        break; /* i < inBytes */
    115                    }
                           /* Accumulate without masking; the marker bits are removed with
                              offsetsFromUTF8[] once the sequence is complete. */
    116                    ch = (ch << 6) + ch2;
    117                    ++mySource;
    118                    i++;
    119                }
    120                else
    121                {
    122                    /* stores a partially calculated target*/
    123                    cnv->toUnicodeStatus = ch;
    124                    cnv->mode = inBytes;
    125                    cnv->toULength = (int8_t) i;
    126                    goto donefornow;
    127                }
    128            }
    129 
    130            // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
                   // Hence a complete 4-byte sequence is treated as illegal when isCESU8 is set.
    131            if (i == inBytes && (!isCESU8 || i <= 3))
    132            {
    133                /* Remove the accumulated high bits */
    134                ch -= offsetsFromUTF8[inBytes];
    135 
    136                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    137                if (ch <= MAXIMUM_UCS2) 
    138                {
    139                    /* fits in 16 bits */
    140                    *(myTarget++) = (char16_t) ch;
    141                }
    142                else
    143                {
    144                    /* write out the surrogates */
    145                    *(myTarget++) = U16_LEAD(ch);
    146                    ch = U16_TRAIL(ch);
    147                    if (myTarget < targetLimit)
    148                    {
    149                        *(myTarget++) = (char16_t)ch;
    150                    }
    151                    else
    152                    {
    153                        /* Put in overflow buffer (not handled here) */
    154                        cnv->UCharErrorBuffer[0] = (char16_t) ch;
    155                        cnv->UCharErrorBufferLength = 1;
    156                        *err = U_BUFFER_OVERFLOW_ERROR;
    157                        break;
    158                    }
    159                }
    160            }
    161            else
    162            {
                       /* Report the i bytes consumed so far (held in toUBytes) as illegal. */
    163                cnv->toULength = (int8_t)i;
    164                *err = U_ILLEGAL_CHAR_FOUND;
    165                break;
    166            }
    167        }
    168    }
    169 
    170 donefornow:
           /* Input left over with a full target and no other error recorded: report overflow. */
    171    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    172    {
    173        /* End of target buffer */
    174        *err = U_BUFFER_OVERFLOW_ERROR;
    175    }
    176 
    177    args->target = myTarget;
    178    args->source = (const char *) mySource;
    179 }
    180 
    /*
     * Converts UTF-8 (or CESU-8, when hasCESU8Data() is true) bytes from
     * args->source into UTF-16 code units at args->target and writes one entry
     * to args->offsets for every code unit written to the target.
     *
     * Each offset is the value of offsetNum at the time the character is
     * written. offsetNum starts at 0 on every call, grows by 1 per single-byte
     * character and by the sequence length i per multi-byte character.
     * NOTE(review): for a sequence resumed from a previous call, i also counts
     * the bytes consumed in that earlier call.
     *
     * Partial-sequence state (cnv->toULength, cnv->mode, cnv->toUnicodeStatus,
     * cnv->toUBytes) is restored on entry and saved again when the input ends in
     * the middle of a sequence.
     *
     * *err is set to U_ILLEGAL_CHAR_FOUND for a malformed sequence (with
     * cnv->toULength set to the bytes consumed for it) and to
     * U_BUFFER_OVERFLOW_ERROR when the target is full while input remains or
     * when the trail surrogate of a pair had to go to cnv->UCharErrorBuffer.
     *
     * args->target, args->source and args->offsets are updated on exit.
     */
    181 static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
    182                                                UErrorCode * err)
    183 {
    184    UConverter *cnv = args->converter;
    185    const unsigned char *mySource = (unsigned char *) args->source;
    186    char16_t *myTarget = args->target;
    187    int32_t *myOffsets = args->offsets;
    188    int32_t offsetNum = 0;
    189    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    190    const char16_t *targetLimit = args->targetLimit;
    191    unsigned char *toUBytes = cnv->toUBytes;
    192    UBool isCESU8 = hasCESU8Data(cnv);
    193    uint32_t ch, ch2 = 0;
    194    int32_t i, inBytes;
    195 
    196    /* Restore size of current sequence */
    197    if (cnv->toULength > 0 && myTarget < targetLimit)
    198    {
    199        inBytes = cnv->mode;            /* restore # of bytes to consume */
    200        i = cnv->toULength;             /* restore # of bytes consumed */
    201        cnv->toULength = 0;
    202 
    203        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
    204        cnv->toUnicodeStatus = 0;
    205        goto morebytes;
    206    }
    207 
    208    while (mySource < sourceLimit && myTarget < targetLimit)
    209    {
    210        ch = *(mySource++);
    211        if (U8_IS_SINGLE(ch))        /* Simple case */
    212        {
    213            *(myTarget++) = (char16_t) ch;
    214            *(myOffsets++) = offsetNum++;
    215        }
    216        else
    217        {
                   /* Remember the lead byte and look up how many bytes the sequence needs. */
    218            toUBytes[0] = (char)ch;
    219            inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
    220            i = 1;
    221 
    222 morebytes:
    223            while (i < inBytes)
    224            {
    225                if (mySource < sourceLimit)
    226                {
                           /* The byte is recorded before it is validated; it only counts as
                              consumed once mySource and i are advanced below. */
    227                    toUBytes[i] = (char) (ch2 = *mySource);
                           /* Stop on a byte that isValidTrail() rejects. CESU-8 exception:
                              directly after a 0xED lead byte any trail byte is let through. */
    228                    if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
    229                            !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
    230                    {
    231                        break; /* i < inBytes */
    232                    }
    233                    ch = (ch << 6) + ch2;
    234                    ++mySource;
    235                    i++;
    236                }
    237                else
    238                {
                           /* Input exhausted inside a sequence: save the state for the next call. */
    239                    cnv->toUnicodeStatus = ch;
    240                    cnv->mode = inBytes;
    241                    cnv->toULength = (int8_t)i;
    242                    goto donefornow;
    243                }
    244            }
    245 
    246            // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
    247            if (i == inBytes && (!isCESU8 || i <= 3))
    248            {
    249                /* Remove the accumulated high bits */
    250                ch -= offsetsFromUTF8[inBytes];
    251 
    252                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    253                if (ch <= MAXIMUM_UCS2) 
    254                {
    255                    /* fits in 16 bits */
    256                    *(myTarget++) = (char16_t) ch;
    257                    *(myOffsets++) = offsetNum;
    258                }
    259                else
    260                {
    261                    /* write out the surrogates */
    262                    *(myTarget++) = U16_LEAD(ch);
    263                    *(myOffsets++) = offsetNum;
    264                    ch = U16_TRAIL(ch);
    265                    if (myTarget < targetLimit)
    266                    {
    267                        *(myTarget++) = (char16_t)ch;
    268                        *(myOffsets++) = offsetNum;
    269                    }
    270                    else
    271                    {
                               /* No room for the trail surrogate: park it in the overflow buffer.
                                  There is no break here; the outer loop condition ends the loop
                                  because the target is full. */
    272                        cnv->UCharErrorBuffer[0] = (char16_t) ch;
    273                        cnv->UCharErrorBufferLength = 1;
    274                        *err = U_BUFFER_OVERFLOW_ERROR;
    275                    }
    276                }
                       /* Advance the running source index past the whole sequence. */
    277                offsetNum += i;
    278            }
    279            else
    280            {
                       /* Report the i bytes consumed so far (held in toUBytes) as illegal. */
    281                cnv->toULength = (int8_t)i;
    282                *err = U_ILLEGAL_CHAR_FOUND;
    283                break;
    284            }
    285        }
    286    }
    287 
    288 donefornow:
    289    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    290    {   /* End of target buffer */
    291        *err = U_BUFFER_OVERFLOW_ERROR;
    292    }
    293 
    294    args->target = myTarget;
    295    args->source = (const char *) mySource;
    296    args->offsets = myOffsets;
    297 }
    298 U_CDECL_END
    299 
    /*
     * Converts UTF-16 code units from args->source into UTF-8 bytes at
     * args->target without writing offsets.
     *
     * When the converter is not CESU-8 (isNotCESU8), a lead surrogate followed
     * by a trail surrogate is combined into one supplementary code point and
     * written as a 4-byte sequence; any other surrogate is reported with
     * U_ILLEGAL_CHAR_FOUND and left in cnv->fromUChar32. When the converter is
     * CESU-8 the surrogate check is skipped, so each surrogate code unit takes
     * the 3-byte path on its own.
     *
     * A surrogate that is the last unit of the input is saved in
     * cnv->fromUChar32 and picked up again at the start of the next call.
     *
     * Bytes that do not fit into the target are stored in cnv->charErrorBuffer
     * and *err is set to U_BUFFER_OVERFLOW_ERROR; the same error is set when the
     * target is full while input remains.
     *
     * args->target and args->source are updated on exit.
     */
    300 U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
    301                                    UErrorCode * err)
    302 {
    303    UConverter *cnv = args->converter;
    304    const char16_t *mySource = args->source;
    305    const char16_t *sourceLimit = args->sourceLimit;
    306    uint8_t *myTarget = (uint8_t *) args->target;
    307    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
    308    uint8_t *tempPtr;
    309    UChar32 ch;
    310    uint8_t tempBuf[4];
    311    int32_t indexToWrite;
    312    UBool isNotCESU8 = !hasCESU8Data(cnv);
    313 
           /* Resume with the code unit saved by a previous call; jumps into the loop body below. */
    314    if (cnv->fromUChar32 && myTarget < targetLimit)
    315    {
    316        ch = cnv->fromUChar32;
    317        cnv->fromUChar32 = 0;
    318        goto lowsurrogate;
    319    }
    320 
    321    while (mySource < sourceLimit && myTarget < targetLimit)
    322    {
    323        ch = *(mySource++);
    324 
    325        if (ch < 0x80)        /* Single byte */
    326        {
    327            *(myTarget++) = (uint8_t) ch;
    328        }
    329        else if (ch < 0x800)  /* Double byte */
    330        {
    331            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
    332            if (myTarget < targetLimit)
    333            {
    334                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
    335            }
    336            else
    337            {
                       /* Second byte does not fit: park it in the overflow buffer. The loop
                          condition then ends the loop because the target is full. */
    338                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
    339                cnv->charErrorBufferLength = 1;
    340                *err = U_BUFFER_OVERFLOW_ERROR;
    341            }
    342        }
    343        else {
    344            /* Check for surrogates */
    345            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
    346 lowsurrogate:
    347                if (mySource < sourceLimit) {
    348                    /* test both code units */
    349                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
    350                        /* convert and consume this supplementary code point */
    351                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
    352                        ++mySource;
    353                        /* exit this condition tree */
    354                    }
    355                    else {
    356                        /* this is an unpaired trail or lead code unit */
    357                        /* callback(illegal) */
    358                        cnv->fromUChar32 = ch;
    359                        *err = U_ILLEGAL_CHAR_FOUND;
    360                        break;
    361                    }
    362                }
    363                else {
    364                    /* no more input */
    365                    cnv->fromUChar32 = ch;
    366                    break;
    367                }
    368            }
    369 
    370            /* Do we write the buffer directly for speed,
    371            or do we have to be careful about target buffer space? */
    372            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
    373 
                   /* indexToWrite is the index of the last byte of the sequence:
                      2 for a 3-byte sequence, 3 for a 4-byte sequence. */
    374            if (ch <= MAXIMUM_UCS2) {
    375                indexToWrite = 2;
    376                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
    377            }
    378            else {
    379                indexToWrite = 3;
    380                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
    381                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
    382            }
    383            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
    384            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
    385 
    386            if (tempPtr == myTarget) {
    387                /* There was enough space to write the codepoint directly. */
    388                myTarget += (indexToWrite + 1);
    389            }
    390            else {
    391                /* We might run out of room soon. Write it slowly. */
    392                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
    393                    if (myTarget < targetLimit) {
    394                        *(myTarget++) = *tempPtr;
    395                    }
    396                    else {
                               /* Appends at the current charErrorBufferLength.
                                  NOTE(review): presumably the length is 0 on entry - confirm with caller. */
    397                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
    398                        *err = U_BUFFER_OVERFLOW_ERROR;
    399                    }
    400                }
    401            }
    402        }
    403    }
    404 
           /* Input left over with a full target and no other error recorded: report overflow. */
    405    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    406    {
    407        *err = U_BUFFER_OVERFLOW_ERROR;
    408    }
    409 
    410    args->target = (char *) myTarget;
    411    args->source = mySource;
    412 }
    413 
    /*
     * Converts UTF-16 code units from args->source into UTF-8 bytes at
     * args->target and writes one entry to args->offsets for every byte written
     * to the target.
     *
     * Each offset is offsetNum, the index of the source code unit that started
     * the character, counted from 0 at the start of this call. When the call
     * resumes with a surrogate saved in cnv->fromUChar32 by a previous call,
     * offsetNum is -1 for that character and nextSourceIndex starts at 0.
     *
     * Surrogate handling, overflow handling (cnv->charErrorBuffer,
     * U_BUFFER_OVERFLOW_ERROR) and the illegal-surrogate error
     * (U_ILLEGAL_CHAR_FOUND with the unit left in cnv->fromUChar32) follow the
     * same steps as the code below shows for the non-CESU-8 and CESU-8 cases:
     * surrogates are only paired when isNotCESU8 is true.
     *
     * args->target, args->source and args->offsets are updated on exit.
     */
    414 U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
    415                                                  UErrorCode * err)
    416 {
    417    UConverter *cnv = args->converter;
    418    const char16_t *mySource = args->source;
    419    int32_t *myOffsets = args->offsets;
    420    const char16_t *sourceLimit = args->sourceLimit;
    421    uint8_t *myTarget = (uint8_t *) args->target;
    422    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
    423    uint8_t *tempPtr;
    424    UChar32 ch;
    425    int32_t offsetNum, nextSourceIndex;
    426    int32_t indexToWrite;
    427    uint8_t tempBuf[4];
    428    UBool isNotCESU8 = !hasCESU8Data(cnv);
    429 
           /* Resume with the code unit saved by a previous call; jumps into the loop body below. */
    430    if (cnv->fromUChar32 && myTarget < targetLimit)
    431    {
    432        ch = cnv->fromUChar32;
    433        cnv->fromUChar32 = 0;
    434        offsetNum = -1;
    435        nextSourceIndex = 0;
    436        goto lowsurrogate;
    437    } else {
    438        offsetNum = 0;
    439    }
    440 
    441    while (mySource < sourceLimit && myTarget < targetLimit)
    442    {
    443        ch = *(mySource++);
    444 
    445        if (ch < 0x80)        /* Single byte */
    446        {
    447            *(myOffsets++) = offsetNum++;
    448            *(myTarget++) = (char) ch;
    449        }
    450        else if (ch < 0x800)  /* Double byte */
    451        {
    452            *(myOffsets++) = offsetNum;
    453            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
    454            if (myTarget < targetLimit)
    455            {
    456                *(myOffsets++) = offsetNum++;
    457                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
    458            }
    459            else
    460            {
                       /* Second byte does not fit: park it in the overflow buffer. offsetNum is
                          not advanced here; the loop condition ends the loop because the
                          target is full. */
    461                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
    462                cnv->charErrorBufferLength = 1;
    463                *err = U_BUFFER_OVERFLOW_ERROR;
    464            }
    465        }
    466        else
    467        /* Check for surrogates */
    468        {
                   /* Source index of the unit after this character; bumped once more below
                      when a trail surrogate is consumed as well. */
    469            nextSourceIndex = offsetNum + 1;
    470 
    471            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
    472 lowsurrogate:
    473                if (mySource < sourceLimit) {
    474                    /* test both code units */
    475                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
    476                        /* convert and consume this supplementary code point */
    477                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
    478                        ++mySource;
    479                        ++nextSourceIndex;
    480                        /* exit this condition tree */
    481                    }
    482                    else {
    483                        /* this is an unpaired trail or lead code unit */
    484                        /* callback(illegal) */
    485                        cnv->fromUChar32 = ch;
    486                        *err = U_ILLEGAL_CHAR_FOUND;
    487                        break;
    488                    }
    489                }
    490                else {
    491                    /* no more input */
    492                    cnv->fromUChar32 = ch;
    493                    break;
    494                }
    495            }
    496 
    497            /* Do we write the buffer directly for speed,
    498            or do we have to be careful about target buffer space? */
    499            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
    500 
                   /* indexToWrite is the index of the last byte of the sequence:
                      2 for a 3-byte sequence, 3 for a 4-byte sequence. */
    501            if (ch <= MAXIMUM_UCS2) {
    502                indexToWrite = 2;
    503                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
    504            }
    505            else {
    506                indexToWrite = 3;
    507                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
    508                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
    509            }
    510            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
    511            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
    512 
    513            if (tempPtr == myTarget) {
    514                /* There was enough space to write the codepoint directly. */
    515                myTarget += (indexToWrite + 1);
                       /* Every byte of the sequence gets the same source offset. */
    516                myOffsets[0] = offsetNum;
    517                myOffsets[1] = offsetNum;
    518                myOffsets[2] = offsetNum;
    519                if (indexToWrite >= 3) {
    520                    myOffsets[3] = offsetNum;
    521                }
    522                myOffsets += (indexToWrite + 1);
    523            }
    524            else {
    525                /* We might run out of room soon. Write it slowly. */
    526                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
    527                    if (myTarget < targetLimit)
    528                    {
    529                        *(myOffsets++) = offsetNum;
    530                        *(myTarget++) = *tempPtr;
    531                    }
    532                    else
    533                    {
                               /* Appends at the current charErrorBufferLength.
                                  NOTE(review): presumably the length is 0 on entry - confirm with caller. */
    534                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
    535                        *err = U_BUFFER_OVERFLOW_ERROR;
    536                    }
    537                }
    538            }
    539            offsetNum = nextSourceIndex;
    540        }
    541    }
    542 
           /* Input left over with a full target and no other error recorded: report overflow. */
    543    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    544    {
    545        *err = U_BUFFER_OVERFLOW_ERROR;
    546    }
    547 
    548    args->target = (char *) myTarget;
    549    args->source = mySource;
    550    args->offsets = myOffsets;
    551 }
    552 
    553 U_CDECL_BEGIN
    /*
     * Reads one code point from the UTF-8 bytes at args->source and returns it,
     * advancing args->source past the bytes consumed.
     *
     * On failure the function returns 0xffff and sets *err:
     *   U_INDEX_OUTOFBOUNDS_ERROR  no input available
     *   U_ILLEGAL_CHAR_FOUND       invalid lead byte or invalid trail byte; the
     *                              bytes consumed are copied to cnv->toUBytes
     *                              and counted in cnv->toULength
     *   U_TRUNCATED_CHAR_FOUND     the input ends before the sequence is
     *                              complete and all bytes seen were valid
     */
    554 static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
    555                                               UErrorCode *err) {
    556    UConverter *cnv;
    557    const uint8_t *sourceInitial;
    558    const uint8_t *source;
    559    uint8_t myByte;
    560    UChar32 ch;
    561    int8_t i;
    562 
    563    /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
    564 
    565    cnv = args->converter;
    566    sourceInitial = source = (const uint8_t *)args->source;
    567    if (source >= (const uint8_t *)args->sourceLimit)
    568    {
    569        /* no input */
    570        *err = U_INDEX_OUTOFBOUNDS_ERROR;
    571        return 0xffff;
    572    }
    573 
    574    myByte = *(source++);
    575    if (U8_IS_SINGLE(myByte))
    576    {
    577        args->source = (const char *)source;
    578        return (UChar32)myByte;
    579    }
    580 
           /* A non-single byte for which no trail bytes are expected cannot start a sequence:
              report that one byte as illegal. */
    581    uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
    582    if (countTrailBytes == 0) {
    583        cnv->toUBytes[0] = myByte;
    584        cnv->toULength = 1;
    585        *err = U_ILLEGAL_CHAR_FOUND;
    586        args->source = (const char *)source;
    587        return 0xffff;
    588    }
    589 
    590    /*The byte sequence is longer than the buffer area passed*/
    591    if (((const char *)source + countTrailBytes) > args->sourceLimit)
    592    {
    593        /* check if all of the remaining bytes are trail bytes */
    594        uint16_t extraBytesToWrite = countTrailBytes + 1;
    595        cnv->toUBytes[0] = myByte;
    596        i = 1;
               /* Assume truncation; downgraded to "illegal" below if a bad trail byte shows up first. */
    597        *err = U_TRUNCATED_CHAR_FOUND;
    598        while(source < (const uint8_t *)args->sourceLimit) {
    599            uint8_t b = *source;
    600            if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
    601                cnv->toUBytes[i++] = b;
    602                ++source;
    603            } else {
    604                /* error even before we run out of input */
    605                *err = U_ILLEGAL_CHAR_FOUND;
    606                break;
    607            }
    608        }
    609        cnv->toULength = i;
    610        args->source = (const char *)source;
    611        return 0xffff;
    612    }
    613 
           /* All expected bytes are available. Accumulate without masking and subtract the
              matching offsetsFromUTF8[] entry at the end. In each branch, source is left
              pointing at the first byte that failed its check. */
    614    ch = myByte << 6;
    615    if(countTrailBytes == 2) {
    616        uint8_t t1 = *source, t2;
    617        if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
    618            args->source = (const char *)(source + 1);
    619            return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
    620        }
    621    } else if(countTrailBytes == 1) {
    622        uint8_t t1 = *source;
    623        if(U8_IS_TRAIL(t1)) {
    624            args->source = (const char *)(source + 1);
    625            return (ch + t1) - offsetsFromUTF8[2];
    626        }
    627    } else {  // countTrailBytes == 3
    628        uint8_t t1 = *source, t2, t3;
    629        if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
    630                U8_IS_TRAIL(t3 = *++source)) {
    631            args->source = (const char *)(source + 1);
    632            return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
    633        }
    634    }
           /* Invalid sequence: consume the bytes before the offending one and report them. */
    635    args->source = (const char *)source;
    636 
    637    for(i = 0; sourceInitial < source; ++i) {
    638        cnv->toUBytes[i] = *sourceInitial++;
    639    }
    640    cnv->toULength = i;
    641    *err = U_ILLEGAL_CHAR_FOUND;
    642    return 0xffff;
    643 } 
    644 U_CDECL_END
    645 
    646 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
    647 
    648 U_CDECL_BEGIN
    649 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
    650 static void U_CALLCONV
    651 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    652                  UConverterToUnicodeArgs *pToUArgs,
    653                  UErrorCode *pErrorCode) {
    654    UConverter *utf8;
    655    const uint8_t *source, *sourceLimit;
    656    uint8_t *target;
    657    int32_t targetCapacity;
    658    int32_t count;
    659 
    660    int8_t oldToULength, toULength, toULimit;
    661 
    662    UChar32 c;
    663    uint8_t b, t1, t2;
    664 
    665    /* set up the local pointers */
    666    utf8=pToUArgs->converter;
    667    source=(uint8_t *)pToUArgs->source;
    668    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    669    target=(uint8_t *)pFromUArgs->target;
    670    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
    671 
    672    /* get the converter state from the UTF-8 UConverter */
    673    if(utf8->toULength > 0) {
    674        toULength=oldToULength=utf8->toULength;
    675        toULimit=(int8_t)utf8->mode;
    676        c=(UChar32)utf8->toUnicodeStatus;
    677    } else {
    678        toULength=oldToULength=toULimit=0;
    679        c = 0;
    680    }
    681 
    682    count=(int32_t)(sourceLimit-source)+oldToULength;
    683    if(count<toULimit) {
    684        /*
    685         * Not enough input to complete the partial character.
    686         * Jump to moreBytes below - it will not output to target.
    687         */
    688    } else if(targetCapacity<toULimit) {
    689        /*
    690         * Not enough target capacity to output the partial character.
    691         * Let the standard converter handle this.
    692         */
    693        *pErrorCode=U_USING_DEFAULT_WARNING;
    694        return;
    695    } else {
    696        // Use a single counter for source and target, counting the minimum of
    697        // the source length and the target capacity.
    698        // Let the standard converter handle edge cases.
    699        if(count>targetCapacity) {
    700            count=targetCapacity;
    701        }
    702 
    703        // The conversion loop checks count>0 only once per character.
    704        // If the buffer ends with a truncated sequence,
    705        // then we reduce the count to stop before that,
    706        // and collect the remaining bytes after the conversion loop.
    707 
    708        // Do not go back into the bytes that will be read for finishing a partial
    709        // sequence from the previous buffer.
    710        int32_t length=count-toULength;
    711        U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
    712        count=toULength+length;
    713    }
    714 
    715    if(c!=0) {
    716        utf8->toUnicodeStatus=0;
    717        utf8->toULength=0;
    718        goto moreBytes;
    719        /* See note in ucnv_SBCSFromUTF8() about this goto. */
    720    }
    721 
    722    /* conversion loop */
    723    while(count>0) {
    724        b=*source++;
    725        if(U8_IS_SINGLE(b)) {
    726            /* convert ASCII */
    727            *target++=b;
    728            --count;
    729            continue;
    730        } else {
    731            if(b>=0xe0) {
    732                if( /* handle U+0800..U+FFFF inline */
    733                    b<0xf0 &&
    734                    U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
    735                    U8_IS_TRAIL(t2=source[1])
    736                ) {
    737                    source+=2;
    738                    *target++=b;
    739                    *target++=t1;
    740                    *target++=t2;
    741                    count-=3;
    742                    continue;
    743                }
    744            } else {
    745                if( /* handle U+0080..U+07FF inline */
    746                    b>=0xc2 &&
    747                    U8_IS_TRAIL(t1=*source)
    748                ) {
    749                    ++source;
    750                    *target++=b;
    751                    *target++=t1;
    752                    count-=2;
    753                    continue;
    754                }
    755            }
    756 
    757            /* handle "complicated" and error cases, and continuing partial characters */
    758            oldToULength=0;
    759            toULength=1;
    760            toULimit=U8_COUNT_BYTES_NON_ASCII(b);
    761            c=b;
    762 moreBytes:
    763            while(toULength<toULimit) {
    764                if(source<sourceLimit) {
    765                    b=*source;
    766                    if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
    767                        ++source;
    768                        ++toULength;
    769                        c=(c<<6)+b;
    770                    } else {
    771                        break; /* sequence too short, stop with toULength<toULimit */
    772                    }
    773                } else {
    774                    /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
    775                    source-=(toULength-oldToULength);
    776                    while(oldToULength<toULength) {
    777                        utf8->toUBytes[oldToULength++]=*source++;
    778                    }
    779                    utf8->toUnicodeStatus=c;
    780                    utf8->toULength=toULength;
    781                    utf8->mode=toULimit;
    782                    pToUArgs->source=(char *)source;
    783                    pFromUArgs->target=(char *)target;
    784                    return;
    785                }
    786            }
    787 
    788            if(toULength!=toULimit) {
    789                /* error handling: illegal UTF-8 byte sequence */
    790                source-=(toULength-oldToULength);
    791                while(oldToULength<toULength) {
    792                    utf8->toUBytes[oldToULength++]=*source++;
    793                }
    794                utf8->toULength=toULength;
    795                pToUArgs->source=(char *)source;
    796                pFromUArgs->target=(char *)target;
    797                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    798                return;
    799            }
    800 
    801            /* copy the legal byte sequence to the target */
    802            {
    803                int8_t i;
    804 
    805                for(i=0; i<oldToULength; ++i) {
    806                    *target++=utf8->toUBytes[i];
    807                }
    808                source-=(toULength-oldToULength);
    809                for(; i<toULength; ++i) {
    810                    *target++=*source++;
    811                }
    812                count-=toULength;
    813            }
    814        }
    815    }
    816    U_ASSERT(count>=0);
    817 
    818    if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
    819        if(target==(const uint8_t *)pFromUArgs->targetLimit) {
    820            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    821        } else {
    822            b=*source;
    823            toULimit=U8_COUNT_BYTES(b);
    824            if(toULimit>(sourceLimit-source)) {
    825                /* collect a truncated byte sequence */
    826                toULength=0;
    827                c=b;
    828                for(;;) {
    829                    utf8->toUBytes[toULength++]=b;
    830                    if(++source==sourceLimit) {
    831                        /* partial byte sequence at end of source */
    832                        utf8->toUnicodeStatus=c;
    833                        utf8->toULength=toULength;
    834                        utf8->mode=toULimit;
    835                        break;
    836                    } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
    837                        utf8->toULength=toULength;
    838                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    839                        break;
    840                    }
    841                    c=(c<<6)+b;
    842                }
    843            } else {
    844                /* partial-sequence target overflow: fall back to the pivoting implementation */
    845                *pErrorCode=U_USING_DEFAULT_WARNING;
    846            }
    847        }
    848    }
    849 
    850    /* write back the updated pointers */
    851    pToUArgs->source=(char *)source;
    852    pFromUArgs->target=(char *)target;
    853 }
    854 
    855 U_CDECL_END
    856 
    857 /* UTF-8 converter data ----------------------------------------------------- */
    858 
    859 static const UConverterImpl _UTF8Impl={ /* table of implementation entries for the UTF-8 converter; paired with _UTF8StaticData in _UTF8Data */
    860    UCNV_UTF8, /* converter type constant */
    861 /* NOTE(review): entries are positional; the slot labels below are presumed from the UConverterImpl declaration, which is not visible in this file -- confirm against its header */
    862    nullptr, /* presumably load: not provided */
    863    nullptr, /* presumably unload: not provided */
    864
    865    nullptr, /* presumably open: not provided */
    866    nullptr, /* presumably close: not provided */
    867    nullptr, /* presumably reset: not provided */
    868
    869    ucnv_toUnicode_UTF8, /* bytes -> Unicode */
    870    ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* bytes -> Unicode, offsets-tracking variant */
    871    ucnv_fromUnicode_UTF8, /* Unicode -> bytes */
    872    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* Unicode -> bytes, offsets-tracking variant */
    873    ucnv_getNextUChar_UTF8, /* dedicated next-character function (the CESU-8 table leaves this slot nullptr) */
    874
    875    nullptr, /* presumably getStarters: not provided */
    876    nullptr, /* presumably getName: not provided */
    877    nullptr, /* presumably writeSub: not provided */
    878    nullptr, /* presumably safeClone: not provided */
    879    ucnv_getNonSurrogateUnicodeSet, /* Unicode-set callback; the CESU-8 table uses ucnv_getCompleteUnicodeSet here instead */
    880
    881    ucnv_UTF8FromUTF8, /* the same UTF-8-to-UTF-8 function fills both of the final two slots */
    882    ucnv_UTF8FromUTF8
    883 };
    884 
    885 /* CCSID 1208 refers to UTF-8 for any version of Unicode */
    886 static const UConverterStaticData _UTF8StaticData={ /* static descriptor for the UTF-8 converter; positional initializer -- field labels below are presumed, confirm against the UConverterStaticData declaration */
    887    sizeof(UConverterStaticData), /* size of this structure */
    888    "UTF-8", /* converter name */
    889    1208, UCNV_IBM, UCNV_UTF8, /* CCSID number, then presumably the platform constant and the converter type constant */
    890    1, 3, /* presumably min 1; max 3 bytes per char16_t from UTF-8 (4 bytes from surrogate _pair_) */
    891    { 0xef, 0xbf, 0xbd, 0 },3,false,false, /* EF BF BD is U+FFFD encoded in UTF-8; 3 is presumably the length of that substitution sequence, followed by two flags left off */
    892    0, /* NOTE(review): meaning of this zero field is not visible here -- confirm */
    893    0, /* NOTE(review): meaning of this zero field is not visible here -- confirm */
    894    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    895 };
    896 
    897 
    898 const UConverterSharedData _UTF8Data= /* shared-data object for UTF-8, initialized from the static descriptor and the implementation table defined in this file */
    899        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
    900 
    901 /* CESU-8 converter data ---------------------------------------------------- */
    902 
    903 static const UConverterImpl _CESU8Impl={ /* table of implementation entries for the CESU-8 converter; reuses the UTF-8 conversion functions (see file header: CESU-8 shares them, with a branch for supplementary code points) */
    904    UCNV_CESU8, /* converter type constant */
    905 /* NOTE(review): entries are positional; the slot labels below are presumed from the UConverterImpl declaration, which is not visible in this file -- confirm against its header */
    906    nullptr, /* presumably load: not provided */
    907    nullptr, /* presumably unload: not provided */
    908
    909    nullptr, /* presumably open: not provided */
    910    nullptr, /* presumably close: not provided */
    911    nullptr, /* presumably reset: not provided */
    912
    913    ucnv_toUnicode_UTF8, /* bytes -> Unicode, same function as the UTF-8 table */
    914    ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* bytes -> Unicode with offsets, same function as the UTF-8 table */
    915    ucnv_fromUnicode_UTF8, /* Unicode -> bytes, same function as the UTF-8 table */
    916    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* Unicode -> bytes with offsets, same function as the UTF-8 table */
    917    nullptr, /* no dedicated next-character function here, unlike the UTF-8 table */
    918
    919    nullptr, /* presumably getStarters: not provided */
    920    nullptr, /* presumably getName: not provided */
    921    nullptr, /* presumably writeSub: not provided */
    922    nullptr, /* presumably safeClone: not provided */
    923    ucnv_getCompleteUnicodeSet, /* Unicode-set callback; the UTF-8 table uses ucnv_getNonSurrogateUnicodeSet here instead */
    924
    925    nullptr, /* final two slots are empty here; the UTF-8 table fills them with ucnv_UTF8FromUTF8 */
    926    nullptr
    927 };
    928 
    929 static const UConverterStaticData _CESU8StaticData={ /* static descriptor for the CESU-8 converter; positional initializer -- field labels below are presumed, confirm against the UConverterStaticData declaration */
    930    sizeof(UConverterStaticData), /* size of this structure */
    931    "CESU-8", /* converter name */
    932    9400, /* CCSID for CESU-8 */
    933    UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* presumably platform constant, converter type constant, then min 1 / max 3 bytes per char16_t -- same 1, 3 pair as the UTF-8 descriptor */
    934    { 0xef, 0xbf, 0xbd, 0 },3,false,false, /* EF BF BD is U+FFFD encoded in UTF-8; 3 is presumably the length of that substitution sequence, followed by two flags left off */
    935    0, /* NOTE(review): meaning of this zero field is not visible here -- confirm */
    936    0, /* NOTE(review): meaning of this zero field is not visible here -- confirm */
    937    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    938 };
    939 
    940 
    941 const UConverterSharedData _CESU8Data= /* shared-data object for CESU-8, initialized from the static descriptor and the implementation table defined in this file */
    942        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
    943 
    944 #endif
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE