[ tor-browser ].git.dasho

ucnv_u16.cpp (49762B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*  
      4 **********************************************************************
      5 *   Copyright (C) 2002-2015, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   file name:  ucnv_u16.cpp
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jul01
     14 *   created by: Markus W. Scherer
     15 *
     16 *   UTF-16 converter implementation. Used to be in ucnv_utf.c.
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_CONVERSION
     22 
     23 #include "unicode/ucnv.h"
     24 #include "unicode/uversion.h"
     25 #include "ucnv_bld.h"
     26 #include "ucnv_cnv.h"
     27 #include "cmemory.h"
     28 
     /*
      * Marker value for UConverter::fromUnicodeStatus: while the status equals this value,
      * the fromUnicode functions in this file write a BOM before any other output
      * and then clear the status to 0.
      */
     29 enum {
     30    UCNV_NEED_TO_WRITE_BOM=1
     31 };
     32 
     33 U_CDECL_BEGIN
     34 /*
     35 * The UTF-16 toUnicode implementation is also used for the Java-specific
     36 * "with BOM" variants of UTF-16BE and UTF-16LE.
     37 */
     /*
      * Forward declaration only. The UTF-16BE/LE toUnicode functions in this file
      * hand the whole call over to this function while the converter's mode is below 8.
      */
     38 static void  U_CALLCONV
     39 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
     40                           UErrorCode *pErrorCode);
     41 
     42 /* UTF-16BE ----------------------------------------------------------------- */
     43 
     /*
      * Platform-endian alias: _UTF16PEFromUnicodeWithOffsets resolves to the big-endian
      * fromUnicode function when U_IS_BIG_ENDIAN is set, otherwise to the little-endian one.
      */
     44 #if U_IS_BIG_ENDIAN
     45 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
     46 #else
     47 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
     48 #endif
     49 
     50 
     /*
      * Writes the UTF-16 code units in [pArgs->source, pArgs->sourceLimit) as big-endian
      * bytes (high byte first) to pArgs->target.
      *
      * - If cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM, the bytes FE FF are handed to
      *   ucnv_fromUWriteBytes() first (with -1 as the source index) and the status is cleared.
      * - A lead surrogate saved in cnv->fromUChar32 by an earlier call is combined with a
      *   trail surrogate at the start of this buffer; those 4 bytes get offset -1.
      * - A surrogate pair is only written by the main loops when all 4 bytes fit.
      * - An unmatched surrogate sets U_ILLEGAL_CHAR_FOUND and is stored in cnv->fromUChar32;
      *   a lead surrogate at the very end of the input is stored there without an error.
      * - A final character that does not fit completely is collected in overflow[] and
      *   handed to ucnv_fromUWriteBytes().
      * - U_BUFFER_OVERFLOW_ERROR is set when the target is full and input remains.
      * - If pArgs->offsets is non-null, one source index is stored per output byte.
      *
      * pArgs->source, pArgs->target and pArgs->offsets are written back at the end;
      * the two early returns (empty input, no target room) skip that write-back.
      */
     51 static void  U_CALLCONV
     52 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
     53                               UErrorCode *pErrorCode) {
     54    UConverter *cnv;
     55    const char16_t *source;
     56    char *target;
     57    int32_t *offsets;
     58 
     59    uint32_t targetCapacity, length, sourceIndex;
     60    char16_t c, trail;
     61    char overflow[4];
     62 
     63    source=pArgs->source;
           /* NOTE: length is unsigned (uint32_t), so the length<=0 test below is true only for length==0 */
     64    length=(int32_t)(pArgs->sourceLimit-source);
     65    if(length<=0) {
     66        /* no input, nothing to do */
     67        return;
     68    }
     69 
     70    cnv=pArgs->converter;
     71 
     72    /* write the BOM if necessary */
     73    if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
     74        static const char bom[]={ (char)0xfeu, (char)0xffu };
     75        ucnv_fromUWriteBytes(cnv,
     76                             bom, 2,
     77                             &pArgs->target, pArgs->targetLimit,
     78                             &pArgs->offsets, -1,
     79                             pErrorCode);
     80        cnv->fromUnicodeStatus=0;
     81    }
     82 
           /* no room at all in the target (possibly after the BOM): report overflow, consume no input */
     83    target=pArgs->target;
     84    if(target >= pArgs->targetLimit) {
     85        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     86        return;
     87    }
     88 
     89    targetCapacity=(uint32_t)(pArgs->targetLimit-target);
     90    offsets=pArgs->offsets;
     91    sourceIndex=0;
     92 
     93    /* c!=0 indicates in several places outside the main loops that a surrogate was found */
     94 
     95    if((c=(char16_t)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
     96        /* the last buffer ended with a lead surrogate, output the surrogate pair */
     97        ++source;
     98        --length;
     99        target[0]=(uint8_t)(c>>8);
    100        target[1]=(uint8_t)c;
    101        target[2]=(uint8_t)(trail>>8);
    102        target[3]=(uint8_t)trail;
    103        target+=4;
    104        targetCapacity-=4;
    105        if(offsets!=nullptr) {
    106            *offsets++=-1;
    107            *offsets++=-1;
    108            *offsets++=-1;
    109            *offsets++=-1;
    110        }
    111        sourceIndex=1;
    112        cnv->fromUChar32=c=0;
    113    }
    114 
    115    if(c==0) {
    116        /* copy an even number of bytes for complete UChars */
    117        uint32_t count=2*length;
    118        if(count>targetCapacity) {
    119            count=targetCapacity&~1;
    120        }
    121        /* count is even */
    122        targetCapacity-=count;
    123        count>>=1;
               /* count is now a number of code units; length becomes the input left after the loop */
    124        length-=count;
    125 
    126        if(offsets==nullptr) {
    127            while(count>0) {
    128                c=*source++;
    129                if(U16_IS_SINGLE(c)) {
    130                    target[0]=(uint8_t)(c>>8);
    131                    target[1]=(uint8_t)c;
    132                    target+=2;
    133                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    134                    ++source;
    135                    --count;
    136                    target[0]=(uint8_t)(c>>8);
    137                    target[1]=(uint8_t)c;
    138                    target[2]=(uint8_t)(trail>>8);
    139                    target[3]=(uint8_t)trail;
    140                    target+=4;
    141                } else {
    142                    break;
    143                }
    144                --count;
    145            }
    146        } else {
    147            while(count>0) {
    148                c=*source++;
    149                if(U16_IS_SINGLE(c)) {
    150                    target[0]=(uint8_t)(c>>8);
    151                    target[1]=(uint8_t)c;
    152                    target+=2;
    153                    *offsets++=sourceIndex;
    154                    *offsets++=sourceIndex++;
    155                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    156                    ++source;
    157                    --count;
    158                    target[0]=(uint8_t)(c>>8);
    159                    target[1]=(uint8_t)c;
    160                    target[2]=(uint8_t)(trail>>8);
    161                    target[3]=(uint8_t)trail;
    162                    target+=4;
    163                    *offsets++=sourceIndex;
    164                    *offsets++=sourceIndex;
    165                    *offsets++=sourceIndex;
    166                    *offsets++=sourceIndex;
    167                    sourceIndex+=2;
    168                } else {
    169                    break;
    170                }
    171                --count;
    172            }
    173        }
    174 
    175        if(count==0) {
    176            /* done with the loop for complete UChars */
    177            if(length>0 && targetCapacity>0) {
    178                /*
    179                 * there is more input and some target capacity -
    180                 * it must be targetCapacity==1 because otherwise
    181                 * the above would have copied more;
    182                 * prepare for overflow output
    183                 */
    184                if(U16_IS_SINGLE(c=*source++)) {
    185                    overflow[0]=(char)(c>>8);
    186                    overflow[1]=(char)c;
    187                    length=2; /* 2 bytes to output */
    188                    c=0;
    189                /* } else { keep c for surrogate handling, length will be set there */
    190                }
    191            } else {
    192                length=0;
    193                c=0;
    194            }
    195        } else {
    196            /* keep c for surrogate handling, length will be set there */
                   /* give back the target room that was reserved for the code units not written */
    197            targetCapacity+=2*count;
    198        }
    199    } else {
    200        length=0; /* from here on, length counts the bytes in overflow[] */
    201    }
    202    
    203    if(c!=0) {
    204        /*
    205         * c is a surrogate, and
    206         * - source or target too short
    207         * - or the surrogate is unmatched
    208         */
    209        length=0;
    210        if(U16_IS_SURROGATE_LEAD(c)) {
    211            if(source<pArgs->sourceLimit) {
    212                if(U16_IS_TRAIL(trail=*source)) {
    213                    /* output the surrogate pair, will overflow (see conditions comment above) */
    214                    ++source;
    215                    overflow[0]=(char)(c>>8);
    216                    overflow[1]=(char)c;
    217                    overflow[2]=(char)(trail>>8);
    218                    overflow[3]=(char)trail;
    219                    length=4; /* 4 bytes to output */
    220                    c=0;
    221                } else {
    222                    /* unmatched lead surrogate */
    223                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    224                }
    225            } else {
    226                /* see if the trail surrogate is in the next buffer */
    227            }
    228        } else {
    229            /* unmatched trail surrogate */
    230            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    231        }
               /* c is 0 after a completed pair, otherwise the pending or offending surrogate */
    232        cnv->fromUChar32=c;
    233    }
    234 
    235    if(length>0) {
    236        /* output length bytes with overflow (length>targetCapacity>0) */
    237        ucnv_fromUWriteBytes(cnv,
    238                             overflow, length,
    239                             &target, pArgs->targetLimit,
    240                             &offsets, sourceIndex,
    241                             pErrorCode);
    242        targetCapacity = static_cast<uint32_t>(pArgs->targetLimit - target);
    243    }
    244 
    245    if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
    246        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    247    }
    248 
    249    /* write back the updated pointers */
    250    pArgs->source=source;
    251    pArgs->target = target;
    252    pArgs->offsets=offsets;
    253 }
    254 
    /*
     * Reads big-endian UTF-16 bytes from [pArgs->source, pArgs->sourceLimit) and writes
     * the resulting code units to pArgs->target.
     *
     * While cnv->mode<8 the whole call is delegated to _UTF16ToUnicodeWithOffsets().
     *
     * State carried between calls:
     * - cnv->toUBytes[]/cnv->toULength: bytes of a code unit or surrogate pair that is
     *   not complete yet (1 to 3 bytes), or the 2 bytes of an offending surrogate on error;
     * - cnv->toUnicodeStatus: 0x100|byte when the first byte of the unit following an
     *   unmatched lead surrogate came from an earlier buffer and cannot stay in toUBytes[];
     * - cnv->UCharErrorBuffer[0]: a trail surrogate whose lead was written but which did
     *   not fit into the target itself.
     *
     * Errors: U_ILLEGAL_CHAR_FOUND for an unmatched surrogate, U_BUFFER_OVERFLOW_ERROR
     * when the target is full and input (or a pending trail surrogate) remains.
     *
     * If pArgs->offsets is non-null, one byte index (counted from the source position at
     * entry) is stored per output code unit; units completed from saved bytes get -1.
     *
     * pArgs->source, pArgs->target and pArgs->offsets are written back before returning,
     * except on the early returns for empty input and for a target with no room.
     */
    255 static void  U_CALLCONV
    256 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    257                             UErrorCode *pErrorCode) {
    258    UConverter *cnv;
    259    const uint8_t *source;
    260    char16_t *target;
    261    int32_t *offsets;
    262 
    263    uint32_t targetCapacity, length, count, sourceIndex;
    264    char16_t c, trail;
    265 
    266    if(pArgs->converter->mode<8) {
    267        _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
    268        return;
    269    }
    270 
    271    cnv=pArgs->converter;
    272    source=(const uint8_t *)pArgs->source;
           /* NOTE: length is unsigned (uint32_t), so the length<=0 test below is true only for length==0 */
    273    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
    274    if(length<=0 && cnv->toUnicodeStatus==0) {
    275        /* no input, nothing to do */
    276        return;
    277    }
    278 
    279    target=pArgs->target;
    280    if(target >= pArgs->targetLimit) {
    281        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    282        return;
    283    }
    284 
    285    targetCapacity=(uint32_t)(pArgs->targetLimit-target);
    286    offsets=pArgs->offsets;
    287    sourceIndex=0;
    288    c=0;
    289 
    290    /* complete a partial char16_t or pair from the last call */
    291    if(cnv->toUnicodeStatus!=0) {
    292        /*
    293         * special case: single byte from a previous buffer,
    294         * where the byte turned out not to belong to a trail surrogate
    295         * and the preceding, unmatched lead surrogate was put into toUBytes[]
    296         * for error handling
    297         */
    298        cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
    299        cnv->toULength=1;
    300        cnv->toUnicodeStatus=0;
    301    }
    302    if((count=cnv->toULength)!=0) {
    303        uint8_t *p=cnv->toUBytes;
    304        do {
    305            p[count++]=*source++;
    306            ++sourceIndex;
    307            --length;
    308            if(count==2) {
    309                c=((char16_t)p[0]<<8)|p[1];
    310                if(U16_IS_SINGLE(c)) {
    311                    /* output the BMP code point */
    312                    *target++=c;
    313                    if(offsets!=nullptr) {
    314                        *offsets++=-1;
    315                    }
    316                    --targetCapacity;
    317                    count=0;
    318                    c=0;
    319                    break;
    320                } else if(U16_IS_SURROGATE_LEAD(c)) {
    321                    /* continue collecting bytes for the trail surrogate */
    322                    c=0; /* avoid unnecessary surrogate handling below */
    323                } else {
    324                    /* fall through to error handling for an unmatched trail surrogate */
    325                    break;
    326                }
    327            } else if(count==4) {
    328                c=((char16_t)p[0]<<8)|p[1];
    329                trail=((char16_t)p[2]<<8)|p[3];
    330                if(U16_IS_TRAIL(trail)) {
    331                    /* output the surrogate pair */
    332                    *target++=c;
    333                    if(targetCapacity>=2) {
    334                        *target++=trail;
    335                        if(offsets!=nullptr) {
    336                            *offsets++=-1;
    337                            *offsets++=-1;
    338                        }
    339                        targetCapacity-=2;
    340                    } else /* targetCapacity==1 */ {
    341                        targetCapacity=0;
    342                        cnv->UCharErrorBuffer[0]=trail;
    343                        cnv->UCharErrorBufferLength=1;
    344                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    345                    }
    346                    count=0;
    347                    c=0;
    348                    break;
    349                } else {
    350                    /* unmatched lead surrogate, handle here for consistent toUBytes[] */
    351                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    352 
    353                    /* back out reading the code unit after it */
                           /*
                            * source only moves forward from pArgs->source in this loop, so the number
                            * of bytes taken from the current buffer is source-pArgs->source;
                            * if both bytes of the following unit came from this buffer,
                            * simply un-read them
                            */
    354                    if((source-(const uint8_t *)pArgs->source)>=2) {
    355                        source-=2;
    356                    } else {
    357                        /*
    358                         * if the trail unit's first byte was in a previous buffer, then
    359                         * we need to put it into a special place because toUBytes[] will be
    360                         * used for the lead unit's bytes
    361                         */
    362                        cnv->toUnicodeStatus=0x100|p[2];
    363                        --source;
    364                    }
    365                    cnv->toULength=2;
    366 
    367                    /* write back the updated pointers */
    368                    pArgs->source=(const char *)source;
    369                    pArgs->target=target;
    370                    pArgs->offsets=offsets;
    371                    return;
    372                }
    373            }
    374        } while(length>0);
    375        cnv->toULength=(int8_t)count;
    376    }
    377 
    378    /* copy an even number of bytes for complete UChars */
    379    count=2*targetCapacity;
    380    if(count>length) {
    381        count=length&~1;
    382    }
    383    if(c==0 && count>0) {
    384        length-=count;
               /* count is now a number of code units */
    385        count>>=1;
    386        targetCapacity-=count;
    387        if(offsets==nullptr) {
    388            do {
    389                c=((char16_t)source[0]<<8)|source[1];
    390                source+=2;
    391                if(U16_IS_SINGLE(c)) {
    392                    *target++=c;
    393                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
    394                          U16_IS_TRAIL(trail=((char16_t)source[0]<<8)|source[1])
    395                ) {
    396                    source+=2;
    397                    --count;
    398                    *target++=c;
    399                    *target++=trail;
    400                } else {
    401                    break;
    402                }
    403            } while(--count>0);
    404        } else {
    405            do {
    406                c=((char16_t)source[0]<<8)|source[1];
    407                source+=2;
    408                if(U16_IS_SINGLE(c)) {
    409                    *target++=c;
    410                    *offsets++=sourceIndex;
    411                    sourceIndex+=2;
    412                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
    413                          U16_IS_TRAIL(trail=((char16_t)source[0]<<8)|source[1])
    414                ) {
    415                    source+=2;
    416                    --count;
    417                    *target++=c;
    418                    *target++=trail;
    419                    *offsets++=sourceIndex;
    420                    *offsets++=sourceIndex;
    421                    sourceIndex+=4;
    422                } else {
    423                    break;
    424                }
    425            } while(--count>0);
    426        }
    427 
    428        if(count==0) {
    429            /* done with the loop for complete UChars */
    430            c=0;
    431        } else {
    432            /* keep c for surrogate handling, trail will be set there */
    433            length+=2*(count-1); /* one more byte pair was consumed than count decremented */
    434            targetCapacity+=count;
    435        }
    436    }
    437 
    438    if(c!=0) {
    439        /*
    440         * c is a surrogate, and
    441         * - source or target too short
    442         * - or the surrogate is unmatched
    443         */
    444        cnv->toUBytes[0]=(uint8_t)(c>>8);
    445        cnv->toUBytes[1]=(uint8_t)c;
    446        cnv->toULength=2;
    447 
    448        if(U16_IS_SURROGATE_LEAD(c)) {
    449            if(length>=2) {
    450                if(U16_IS_TRAIL(trail=((char16_t)source[0]<<8)|source[1])) {
    451                    /* output the surrogate pair, will overflow (see conditions comment above) */
    452                    source+=2;
    453                    length-=2;
    454                    *target++=c;
    455                    if(offsets!=nullptr) {
    456                        *offsets++=sourceIndex;
    457                    }
    458                    cnv->UCharErrorBuffer[0]=trail;
    459                    cnv->UCharErrorBufferLength=1;
    460                    cnv->toULength=0;
    461                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    462                } else {
    463                    /* unmatched lead surrogate */
    464                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    465                }
    466            } else {
    467                /* see if the trail surrogate is in the next buffer */
    468            }
    469        } else {
    470            /* unmatched trail surrogate */
    471            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    472        }
    473    }
    474 
    475    if(U_SUCCESS(*pErrorCode)) {
    476        /* check for a remaining source byte */
    477        if(length>0) {
    478            if(targetCapacity==0) {
    479                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    480            } else {
    481                /* it must be length==1 because otherwise the above would have copied more */
    482                cnv->toUBytes[cnv->toULength++]=*source++;
    483            }
    484        }
    485    }
    486 
    487    /* write back the updated pointers */
    488    pArgs->source=(const char *)source;
    489    pArgs->target=target;
    490    pArgs->offsets=offsets;
    491 }
    492 
    /*
     * Reads one code point from the big-endian UTF-16 bytes at pArgs->source.
     *
     * Returns UCNV_GET_NEXT_UCHAR_USE_TO_U without touching the input while
     * the converter's mode is below 8.
     * Otherwise returns the code point (a surrogate pair is combined with
     * U16_GET_SUPPLEMENTARY) and advances pArgs->source past the bytes used.
     *
     * Failures return 0xffff and set *err:
     * - U_INDEX_OUTOFBOUNDS_ERROR: no input; pArgs->source is not changed.
     * - U_TRUNCATED_CHAR_FOUND: a single byte, or a lead surrogate followed by fewer
     *   than 2 bytes; all remaining bytes are copied to toUBytes[] and consumed.
     * - U_ILLEGAL_CHAR_FOUND: an unmatched surrogate; its 2 bytes are copied to
     *   toUBytes[] and only those 2 bytes are consumed.
     */
    493 static UChar32  U_CALLCONV
    494 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
    495    const uint8_t *s, *sourceLimit;
    496    UChar32 c;
    497 
    498    if(pArgs->converter->mode<8) {
    499        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
    500    }
    501 
    502    s=(const uint8_t *)pArgs->source;
    503    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    504 
    505    if(s>=sourceLimit) {
    506        /* no input */
    507        *err=U_INDEX_OUTOFBOUNDS_ERROR;
    508        return 0xffff;
    509    }
    510 
    511    if(s+2>sourceLimit) {
    512        /* only one byte: truncated char16_t */
    513        pArgs->converter->toUBytes[0]=*s++;
    514        pArgs->converter->toULength=1;
    515        pArgs->source=(const char *)s;
    516        *err = U_TRUNCATED_CHAR_FOUND;
    517        return 0xffff;
    518    }
    519 
    520    /* get one char16_t */
    521    c=((UChar32)*s<<8)|s[1];
    522    s+=2;
    523 
    524    /* check for a surrogate pair */
    525    if(U_IS_SURROGATE(c)) {
    526        if(U16_IS_SURROGATE_LEAD(c)) {
    527            if(s+2<=sourceLimit) {
    528                char16_t trail;
    529 
    530                /* get a second char16_t and see if it is a trail surrogate */
    531                trail=((char16_t)*s<<8)|s[1];
    532                if(U16_IS_TRAIL(trail)) {
    533                    c=U16_GET_SUPPLEMENTARY(c, trail);
    534                    s+=2;
    535                } else {
    536                    /* unmatched lead surrogate */
                           /* a negative c routes to the shared error block below; s still points just past the lead */
    537                    c=-2;
    538                }
    539            } else {
    540                /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
    541                uint8_t *bytes=pArgs->converter->toUBytes;
                       /* step back to the lead surrogate so that it is copied along with the extra byte, if any */
    542                s-=2;
    543                pArgs->converter->toULength=(int8_t)(sourceLimit-s);
    544                do {
    545                    *bytes++=*s++;
    546                } while(s<sourceLimit);
    547 
    548                c=0xffff;
    549                *err=U_TRUNCATED_CHAR_FOUND;
    550            }
    551        } else {
    552            /* unmatched trail surrogate */
    553            c=-2;
    554        }
    555 
    556        if(c<0) {
    557            /* write the unmatched surrogate */
                   /* in both unmatched cases s is 2 bytes past the start of the offending unit */
    558            uint8_t *bytes=pArgs->converter->toUBytes;
    559            pArgs->converter->toULength=2;
    560            *bytes=*(s-2);
    561            bytes[1]=*(s-1);
    562 
    563            c=0xffff;
    564            *err=U_ILLEGAL_CHAR_FOUND;
    565        }
    566    }
    567 
    568    pArgs->source=(const char *)s;
    569    return c;
    570 } 
    571 
    /*
     * Resets the UTF-16BE converter state selected by choice.
     * The toUnicode side (cnv->mode) is reset for choices up to UCNV_RESET_TO_UNICODE;
     * the fromUnicode side (cnv->fromUnicodeStatus) is reset for every choice other than
     * UCNV_RESET_TO_UNICODE, and only for version 1.
     */
    572 static void  U_CALLCONV
    573 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
    574    if(choice<=UCNV_RESET_TO_UNICODE) {
    575        /*
    576         * toUnicode state: version 0 does no BOM handling (mode 8); the Java-specific
    577         * "UnicodeBig" starts in mode 0 and requires BE BOM or no BOM
    578         */
    579        cnv->mode=(UCNV_GET_VERSION(cnv)==0) ? 8 : 0;
    580    }
    581    if(choice==UCNV_RESET_TO_UNICODE) {
    582        return; /* the fromUnicode state is left alone */
    583    }
    584    /* fromUnicode state for "UnicodeBig": prepare to output the UTF-16BE BOM */
    585    if(UCNV_GET_VERSION(cnv)==1) { cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; }
    586 }
    587 
    /*
     * Open hook for UTF-16BE: a converter version above 1 is rejected with
     * U_ILLEGAL_ARGUMENT_ERROR; otherwise both conversion directions are reset.
     * pArgs is not used.
     */
    588 static void  U_CALLCONV
    589 _UTF16BEOpen(UConverter *cnv,
    590             UConverterLoadArgs *pArgs,
    591             UErrorCode *pErrorCode) {
    592    (void)pArgs;
    593    if(UCNV_GET_VERSION(cnv)>1) {
    594        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    595        return;
    596    }
    597    _UTF16BEReset(cnv, UCNV_RESET_BOTH);
    598 }
    599 
    /* Returns the converter name: the plain name for version 0, the ",version=1" name otherwise. */
    600 static const char *  U_CALLCONV
    601 _UTF16BEGetName(const UConverter *cnv) {
    602    /* any non-zero version reports the versioned name */
    603    if(UCNV_GET_VERSION(cnv)!=0) {
    604        return "UTF-16BE,version=1";
    605    }
    606    return "UTF-16BE";
    607 }
    608 U_CDECL_END
    609 
    /*
     * Function table for the UTF-16BE converter. The initializer is positional, so the
     * meaning of each slot is fixed by UConverterImpl (declared elsewhere); slots this
     * converter does not implement are nullptr.
     * NOTE(review): each WithOffsets function is listed twice in a row - presumably the
     * plain and the with-offsets slot share one implementation; confirm against UConverterImpl.
     */
    610 static const UConverterImpl _UTF16BEImpl={
    611    UCNV_UTF16_BigEndian,
    612 
    613    nullptr,
    614    nullptr,
    615 
    616    _UTF16BEOpen,
    617    nullptr,
    618    _UTF16BEReset,
    619 
    620    _UTF16BEToUnicodeWithOffsets,
    621    _UTF16BEToUnicodeWithOffsets,
    622    _UTF16BEFromUnicodeWithOffsets,
    623    _UTF16BEFromUnicodeWithOffsets,
    624    _UTF16BEGetNextUChar,
    625 
    626    nullptr,
    627    _UTF16BEGetName,
    628    nullptr,
    629    nullptr,
    630    ucnv_getNonSurrogateUnicodeSet,
    631 
    632    nullptr,
    633    nullptr
    634 };
    635 
    /*
     * Static description of the UTF-16BE converter: structure size, then the name
     * "UTF-16BE", then positional values whose meaning is fixed by UConverterStaticData
     * (declared elsewhere).
     * NOTE(review): presumably 1200 is the codepage number, "2, 2" the minimum and maximum
     * bytes per character, and { 0xff, 0xfd } with length 2 the substitution character
     * (U+FFFD in big-endian byte order) - confirm against the structure declaration.
     */
    636 static const UConverterStaticData _UTF16BEStaticData={
    637    sizeof(UConverterStaticData),
    638    "UTF-16BE",
    639    1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
    640    { 0xff, 0xfd, 0, 0 },2,false,false,
    641    0,
    642    0,
    643    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    644 };
    645 
    646 
    /*
     * Shared-data object for UTF-16BE, built from the static data and the function table
     * above. It is not declared static, so it has external linkage.
     */
    647 const UConverterSharedData _UTF16BEData=
    648        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
    649 
    650 /* UTF-16LE ----------------------------------------------------------------- */
    651 U_CDECL_BEGIN
    /*
     * Writes the UTF-16 code units in [pArgs->source, pArgs->sourceLimit) as little-endian
     * bytes (low byte first) to pArgs->target. The control flow mirrors
     * _UTF16BEFromUnicodeWithOffsets(); only the byte order and the BOM bytes differ.
     *
     * - If cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM, the bytes FF FE are handed to
     *   ucnv_fromUWriteBytes() first (with -1 as the source index) and the status is cleared.
     * - A lead surrogate saved in cnv->fromUChar32 by an earlier call is combined with a
     *   trail surrogate at the start of this buffer; those 4 bytes get offset -1.
     * - A surrogate pair is only written by the main loops when all 4 bytes fit.
     * - An unmatched surrogate sets U_ILLEGAL_CHAR_FOUND and is stored in cnv->fromUChar32;
     *   a lead surrogate at the very end of the input is stored there without an error.
     * - A final character that does not fit completely is collected in overflow[] and
     *   handed to ucnv_fromUWriteBytes().
     * - U_BUFFER_OVERFLOW_ERROR is set when the target is full and input remains.
     * - If pArgs->offsets is non-null, one source index is stored per output byte.
     *
     * pArgs->source, pArgs->target and pArgs->offsets are written back at the end;
     * the two early returns (empty input, no target room) skip that write-back.
     */
    652 static void  U_CALLCONV
    653 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    654                               UErrorCode *pErrorCode) {
    655    UConverter *cnv;
    656    const char16_t *source;
    657    char *target;
    658    int32_t *offsets;
    659 
    660    uint32_t targetCapacity, length, sourceIndex;
    661    char16_t c, trail;
    662    char overflow[4];
    663 
    664    source=pArgs->source;
           /* NOTE: length is unsigned (uint32_t), so the length<=0 test below is true only for length==0 */
    665    length=(int32_t)(pArgs->sourceLimit-source);
    666    if(length<=0) {
    667        /* no input, nothing to do */
    668        return;
    669    }
    670 
    671    cnv=pArgs->converter;
    672 
    673    /* write the BOM if necessary */
    674    if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    675        static const char bom[]={ (char)0xffu, (char)0xfeu };
    676        ucnv_fromUWriteBytes(cnv,
    677                             bom, 2,
    678                             &pArgs->target, pArgs->targetLimit,
    679                             &pArgs->offsets, -1,
    680                             pErrorCode);
    681        cnv->fromUnicodeStatus=0;
    682    }
    683 
           /* no room at all in the target (possibly after the BOM): report overflow, consume no input */
    684    target=pArgs->target;
    685    if(target >= pArgs->targetLimit) {
    686        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    687        return;
    688    }
    689 
           /* pArgs->target still equals the local target here */
    690    targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
    691    offsets=pArgs->offsets;
    692    sourceIndex=0;
    693 
    694    /* c!=0 indicates in several places outside the main loops that a surrogate was found */
    695 
    696    if((c=(char16_t)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
    697        /* the last buffer ended with a lead surrogate, output the surrogate pair */
    698        ++source;
    699        --length;
    700        target[0]=(uint8_t)c;
    701        target[1]=(uint8_t)(c>>8);
    702        target[2]=(uint8_t)trail;
    703        target[3]=(uint8_t)(trail>>8);
    704        target+=4;
    705        targetCapacity-=4;
    706        if(offsets!=nullptr) {
    707            *offsets++=-1;
    708            *offsets++=-1;
    709            *offsets++=-1;
    710            *offsets++=-1;
    711        }
    712        sourceIndex=1;
    713        cnv->fromUChar32=c=0;
    714    }
    715 
    716    if(c==0) {
    717        /* copy an even number of bytes for complete UChars */
    718        uint32_t count=2*length;
    719        if(count>targetCapacity) {
    720            count=targetCapacity&~1;
    721        }
    722        /* count is even */
    723        targetCapacity-=count;
    724        count>>=1;
               /* count is now a number of code units; length becomes the input left after the loop */
    725        length-=count;
    726 
    727        if(offsets==nullptr) {
    728            while(count>0) {
    729                c=*source++;
    730                if(U16_IS_SINGLE(c)) {
    731                    target[0]=(uint8_t)c;
    732                    target[1]=(uint8_t)(c>>8);
    733                    target+=2;
    734                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    735                    ++source;
    736                    --count;
    737                    target[0]=(uint8_t)c;
    738                    target[1]=(uint8_t)(c>>8);
    739                    target[2]=(uint8_t)trail;
    740                    target[3]=(uint8_t)(trail>>8);
    741                    target+=4;
    742                } else {
    743                    break;
    744                }
    745                --count;
    746            }
    747        } else {
    748            while(count>0) {
    749                c=*source++;
    750                if(U16_IS_SINGLE(c)) {
    751                    target[0]=(uint8_t)c;
    752                    target[1]=(uint8_t)(c>>8);
    753                    target+=2;
    754                    *offsets++=sourceIndex;
    755                    *offsets++=sourceIndex++;
    756                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    757                    ++source;
    758                    --count;
    759                    target[0]=(uint8_t)c;
    760                    target[1]=(uint8_t)(c>>8);
    761                    target[2]=(uint8_t)trail;
    762                    target[3]=(uint8_t)(trail>>8);
    763                    target+=4;
    764                    *offsets++=sourceIndex;
    765                    *offsets++=sourceIndex;
    766                    *offsets++=sourceIndex;
    767                    *offsets++=sourceIndex;
    768                    sourceIndex+=2;
    769                } else {
    770                    break;
    771                }
    772                --count;
    773            }
    774        }
    775 
    776        if(count==0) {
    777            /* done with the loop for complete UChars */
    778            if(length>0 && targetCapacity>0) {
    779                /*
    780                 * there is more input and some target capacity -
    781                 * it must be targetCapacity==1 because otherwise
    782                 * the above would have copied more;
    783                 * prepare for overflow output
    784                 */
    785                if(U16_IS_SINGLE(c=*source++)) {
    786                    overflow[0]=(char)c;
    787                    overflow[1]=(char)(c>>8);
    788                    length=2; /* 2 bytes to output */
    789                    c=0;
    790                /* } else { keep c for surrogate handling, length will be set there */
    791                }
    792            } else {
    793                length=0;
    794                c=0;
    795            }
    796        } else {
    797            /* keep c for surrogate handling, length will be set there */
                   /* give back the target room that was reserved for the code units not written */
    798            targetCapacity+=2*count;
    799        }
    800    } else {
    801        length=0; /* from here on, length counts the bytes in overflow[] */
    802    }
    803    
    804    if(c!=0) {
    805        /*
    806         * c is a surrogate, and
    807         * - source or target too short
    808         * - or the surrogate is unmatched
    809         */
    810        length=0;
    811        if(U16_IS_SURROGATE_LEAD(c)) {
    812            if(source<pArgs->sourceLimit) {
    813                if(U16_IS_TRAIL(trail=*source)) {
    814                    /* output the surrogate pair, will overflow (see conditions comment above) */
    815                    ++source;
    816                    overflow[0]=(char)c;
    817                    overflow[1]=(char)(c>>8);
    818                    overflow[2]=(char)trail;
    819                    overflow[3]=(char)(trail>>8);
    820                    length=4; /* 4 bytes to output */
    821                    c=0;
    822                } else {
    823                    /* unmatched lead surrogate */
    824                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    825                }
    826            } else {
    827                /* see if the trail surrogate is in the next buffer */
    828            }
    829        } else {
    830            /* unmatched trail surrogate */
    831            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    832        }
               /* c is 0 after a completed pair, otherwise the pending or offending surrogate */
    833        cnv->fromUChar32=c;
    834    }
    835 
    836    if(length>0) {
    837        /* output length bytes with overflow (length>targetCapacity>0) */
    838        ucnv_fromUWriteBytes(cnv,
    839                             overflow, length,
    840                             &target, pArgs->targetLimit,
    841                             &offsets, sourceIndex,
    842                             pErrorCode);
    843        targetCapacity = static_cast<uint32_t>(pArgs->targetLimit - target);
    844    }
    845 
    846    if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
    847        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    848    }
    849 
    850    /* write back the updated pointers */
    851    pArgs->source=source;
    852    pArgs->target=target;
    853    pArgs->offsets=offsets;
    854 }
    855 
    856 static void  U_CALLCONV
    857 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    858                             UErrorCode *pErrorCode) {
    859    UConverter *cnv;
    860    const uint8_t *source;
    861    char16_t *target;
    862    int32_t *offsets;
    863 
    864    uint32_t targetCapacity, length, count, sourceIndex;
    865    char16_t c, trail;
    866 
    867    if(pArgs->converter->mode<8) {
    868        _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
    869        return;
    870    }
    871 
    872    cnv=pArgs->converter;
    873    source=(const uint8_t *)pArgs->source;
    874    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
    875    if(length<=0 && cnv->toUnicodeStatus==0) {
    876        /* no input, nothing to do */
    877        return;
    878    }
    879 
    880    target=pArgs->target;
    881    if(target >= pArgs->targetLimit) {
    882        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    883        return;
    884    }
    885 
    886    targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
    887    offsets=pArgs->offsets;
    888    sourceIndex=0;
    889    c=0;
    890 
    891    /* complete a partial char16_t or pair from the last call */
    892    if(cnv->toUnicodeStatus!=0) {
    893        /*
    894         * special case: single byte from a previous buffer,
    895         * where the byte turned out not to belong to a trail surrogate
    896         * and the preceding, unmatched lead surrogate was put into toUBytes[]
    897         * for error handling
    898         */
    899        cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
    900        cnv->toULength=1;
    901        cnv->toUnicodeStatus=0;
    902    }
    903    if((count=cnv->toULength)!=0) {
    904        uint8_t *p=cnv->toUBytes;
    905        do {
    906            p[count++]=*source++;
    907            ++sourceIndex;
    908            --length;
    909            if(count==2) {
    910                c=((char16_t)p[1]<<8)|p[0];
    911                if(U16_IS_SINGLE(c)) {
    912                    /* output the BMP code point */
    913                    *target++=c;
    914                    if(offsets!=nullptr) {
    915                        *offsets++=-1;
    916                    }
    917                    --targetCapacity;
    918                    count=0;
    919                    c=0;
    920                    break;
    921                } else if(U16_IS_SURROGATE_LEAD(c)) {
    922                    /* continue collecting bytes for the trail surrogate */
    923                    c=0; /* avoid unnecessary surrogate handling below */
    924                } else {
    925                    /* fall through to error handling for an unmatched trail surrogate */
    926                    break;
    927                }
    928            } else if(count==4) {
    929                c=((char16_t)p[1]<<8)|p[0];
    930                trail=((char16_t)p[3]<<8)|p[2];
    931                if(U16_IS_TRAIL(trail)) {
    932                    /* output the surrogate pair */
    933                    *target++=c;
    934                    if(targetCapacity>=2) {
    935                        *target++=trail;
    936                        if(offsets!=nullptr) {
    937                            *offsets++=-1;
    938                            *offsets++=-1;
    939                        }
    940                        targetCapacity-=2;
    941                    } else /* targetCapacity==1 */ {
    942                        targetCapacity=0;
    943                        cnv->UCharErrorBuffer[0]=trail;
    944                        cnv->UCharErrorBufferLength=1;
    945                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    946                    }
    947                    count=0;
    948                    c=0;
    949                    break;
    950                } else {
    951                    /* unmatched lead surrogate, handle here for consistent toUBytes[] */
    952                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    953 
    954                    /* back out reading the code unit after it */
    955                    if(((const uint8_t *)pArgs->source-source)>=2) {
    956                        source-=2;
    957                    } else {
    958                        /*
    959                         * if the trail unit's first byte was in a previous buffer, then
    960                         * we need to put it into a special place because toUBytes[] will be
    961                         * used for the lead unit's bytes
    962                         */
    963                        cnv->toUnicodeStatus=0x100|p[2];
    964                        --source;
    965                    }
    966                    cnv->toULength=2;
    967 
    968                    /* write back the updated pointers */
    969                    pArgs->source=(const char *)source;
    970                    pArgs->target=target;
    971                    pArgs->offsets=offsets;
    972                    return;
    973                }
    974            }
    975        } while(length>0);
    976        cnv->toULength=(int8_t)count;
    977    }
    978 
    979    /* copy an even number of bytes for complete UChars */
    980    count=2*targetCapacity;
    981    if(count>length) {
    982        count=length&~1;
    983    }
    984    if(c==0 && count>0) {
    985        length-=count;
    986        count>>=1;
    987        targetCapacity-=count;
    988        if(offsets==nullptr) {
    989            do {
    990                c=((char16_t)source[1]<<8)|source[0];
    991                source+=2;
    992                if(U16_IS_SINGLE(c)) {
    993                    *target++=c;
    994                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
    995                          U16_IS_TRAIL(trail=((char16_t)source[1]<<8)|source[0])
    996                ) {
    997                    source+=2;
    998                    --count;
    999                    *target++=c;
   1000                    *target++=trail;
   1001                } else {
   1002                    break;
   1003                }
   1004            } while(--count>0);
   1005        } else {
   1006            do {
   1007                c=((char16_t)source[1]<<8)|source[0];
   1008                source+=2;
   1009                if(U16_IS_SINGLE(c)) {
   1010                    *target++=c;
   1011                    *offsets++=sourceIndex;
   1012                    sourceIndex+=2;
   1013                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
   1014                          U16_IS_TRAIL(trail=((char16_t)source[1]<<8)|source[0])
   1015                ) {
   1016                    source+=2;
   1017                    --count;
   1018                    *target++=c;
   1019                    *target++=trail;
   1020                    *offsets++=sourceIndex;
   1021                    *offsets++=sourceIndex;
   1022                    sourceIndex+=4;
   1023                } else {
   1024                    break;
   1025                }
   1026            } while(--count>0);
   1027        }
   1028 
   1029        if(count==0) {
   1030            /* done with the loop for complete UChars */
   1031            c=0;
   1032        } else {
   1033            /* keep c for surrogate handling, trail will be set there */
   1034            length+=2*(count-1); /* one more byte pair was consumed than count decremented */
   1035            targetCapacity+=count;
   1036        }
   1037    }
   1038 
   1039    if(c!=0) {
   1040        /*
   1041         * c is a surrogate, and
   1042         * - source or target too short
   1043         * - or the surrogate is unmatched
   1044         */
   1045        cnv->toUBytes[0]=(uint8_t)c;
   1046        cnv->toUBytes[1]=(uint8_t)(c>>8);
   1047        cnv->toULength=2;
   1048 
   1049        if(U16_IS_SURROGATE_LEAD(c)) {
   1050            if(length>=2) {
   1051                if(U16_IS_TRAIL(trail=((char16_t)source[1]<<8)|source[0])) {
   1052                    /* output the surrogate pair, will overflow (see conditions comment above) */
   1053                    source+=2;
   1054                    length-=2;
   1055                    *target++=c;
   1056                    if(offsets!=nullptr) {
   1057                        *offsets++=sourceIndex;
   1058                    }
   1059                    cnv->UCharErrorBuffer[0]=trail;
   1060                    cnv->UCharErrorBufferLength=1;
   1061                    cnv->toULength=0;
   1062                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1063                } else {
   1064                    /* unmatched lead surrogate */
   1065                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1066                }
   1067            } else {
   1068                /* see if the trail surrogate is in the next buffer */
   1069            }
   1070        } else {
   1071            /* unmatched trail surrogate */
   1072            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1073        }
   1074    }
   1075 
   1076    if(U_SUCCESS(*pErrorCode)) {
   1077        /* check for a remaining source byte */
   1078        if(length>0) {
   1079            if(targetCapacity==0) {
   1080                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1081            } else {
   1082                /* it must be length==1 because otherwise the above would have copied more */
   1083                cnv->toUBytes[cnv->toULength++]=*source++;
   1084            }
   1085        }
   1086    }
   1087 
   1088    /* write back the updated pointers */
   1089    pArgs->source=(const char *)source;
   1090    pArgs->target=target;
   1091    pArgs->offsets=offsets;
   1092 }
   1093 
   1094 static UChar32  U_CALLCONV
   1095 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
   1096    const uint8_t *s, *sourceLimit;
   1097    UChar32 c;
   1098 
   1099    if(pArgs->converter->mode<8) {
   1100        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   1101    }
   1102 
   1103    s=(const uint8_t *)pArgs->source;
   1104    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1105 
   1106    if(s>=sourceLimit) {
   1107        /* no input */
   1108        *err=U_INDEX_OUTOFBOUNDS_ERROR;
   1109        return 0xffff;
   1110    }
   1111 
   1112    if(s+2>sourceLimit) {
   1113        /* only one byte: truncated char16_t */
   1114        pArgs->converter->toUBytes[0]=*s++;
   1115        pArgs->converter->toULength=1;
   1116        pArgs->source=(const char *)s;
   1117        *err = U_TRUNCATED_CHAR_FOUND;
   1118        return 0xffff;
   1119    }
   1120 
   1121    /* get one char16_t */
   1122    c=((UChar32)s[1]<<8)|*s;
   1123    s+=2;
   1124 
   1125    /* check for a surrogate pair */
   1126    if(U_IS_SURROGATE(c)) {
   1127        if(U16_IS_SURROGATE_LEAD(c)) {
   1128            if(s+2<=sourceLimit) {
   1129                char16_t trail;
   1130 
   1131                /* get a second char16_t and see if it is a trail surrogate */
   1132                trail=((char16_t)s[1]<<8)|*s;
   1133                if(U16_IS_TRAIL(trail)) {
   1134                    c=U16_GET_SUPPLEMENTARY(c, trail);
   1135                    s+=2;
   1136                } else {
   1137                    /* unmatched lead surrogate */
   1138                    c=-2;
   1139                }
   1140            } else {
   1141                /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
   1142                uint8_t *bytes=pArgs->converter->toUBytes;
   1143                s-=2;
   1144                pArgs->converter->toULength=(int8_t)(sourceLimit-s);
   1145                do {
   1146                    *bytes++=*s++;
   1147                } while(s<sourceLimit);
   1148 
   1149                c=0xffff;
   1150                *err=U_TRUNCATED_CHAR_FOUND;
   1151            }
   1152        } else {
   1153            /* unmatched trail surrogate */
   1154            c=-2;
   1155        }
   1156 
   1157        if(c<0) {
   1158            /* write the unmatched surrogate */
   1159            uint8_t *bytes=pArgs->converter->toUBytes;
   1160            pArgs->converter->toULength=2;
   1161            *bytes=*(s-2);
   1162            bytes[1]=*(s-1);
   1163 
   1164            c=0xffff;
   1165            *err=U_ILLEGAL_CHAR_FOUND;
   1166        }
   1167    }
   1168 
   1169    pArgs->source=(const char *)s;
   1170    return c;
   1171 } 
   1172 
   1173 static void  U_CALLCONV
   1174 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
   1175    if(choice<=UCNV_RESET_TO_UNICODE) {
   1176        /* reset toUnicode state */
   1177        if(UCNV_GET_VERSION(cnv)==0) {
   1178            cnv->mode=8; /* no BOM handling */
   1179        } else {
   1180            cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
   1181        }
   1182    }
   1183    if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
   1184        /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
   1185        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   1186    }
   1187 }
   1188 
   1189 static void  U_CALLCONV
   1190 _UTF16LEOpen(UConverter *cnv,
   1191             UConverterLoadArgs *pArgs,
   1192             UErrorCode *pErrorCode) {
   1193    (void)pArgs;
   1194    if(UCNV_GET_VERSION(cnv)<=1) {
   1195        _UTF16LEReset(cnv, UCNV_RESET_BOTH);
   1196    } else {
   1197        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1198    }
   1199 }
   1200 
   1201 static const char *  U_CALLCONV
   1202 _UTF16LEGetName(const UConverter *cnv) {
   1203    if(UCNV_GET_VERSION(cnv)==0) {
   1204        return "UTF-16LE";
   1205    } else {
   1206        return "UTF-16LE,version=1";
   1207    }
   1208 }
   1209 U_CDECL_END
   1210 
/*
 * Implementation table for the UTF-16LE converter: the converter type followed
 * by positional function-pointer slots; nullptr leaves a slot unset.
 * NOTE(review): the slot descriptions below are inferred from the names of the
 * functions stored in them - confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _UTF16LEImpl={
    UCNV_UTF16_LittleEndian,

    nullptr,
    nullptr,

    /* open / (unset) / reset */
    _UTF16LEOpen,
    nullptr,
    _UTF16LEReset,

    /* toUnicode and fromUnicode: the offsets-capable function fills both slots of each pair */
    _UTF16LEToUnicodeWithOffsets,
    _UTF16LEToUnicodeWithOffsets,
    _UTF16LEFromUnicodeWithOffsets,
    _UTF16LEFromUnicodeWithOffsets,
    _UTF16LEGetNextUChar,

    nullptr,
    _UTF16LEGetName,
    nullptr,
    nullptr,
    ucnv_getNonSurrogateUnicodeSet,

    nullptr,
    nullptr
};
   1236 
   1237 
/*
 * Static description of the UTF-16LE converter.
 * NOTE(review): field meanings are inferred from the values and from the
 * parallel _UTF16StaticData initializer - confirm against the
 * UConverterStaticData declaration.
 */
static const UConverterStaticData _UTF16LEStaticData={
    sizeof(UConverterStaticData),
    "UTF-16LE",
    /* 1202: presumably the CCSID, as in the UTF-16 table; 2, 2: presumably min/max bytes per char - confirm */
    1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
    /* 0xfd 0xff is U+FFFD in little-endian byte order (2 bytes); presumably the substitution character */
    { 0xfd, 0xff, 0, 0 },2,false,false,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
   1247 
   1248 
/* Shared data object for UTF-16LE: ties the static data to the implementation table. */
const UConverterSharedData _UTF16LEData=
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
   1251 
   1252 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
   1253 
   1254 /*
   1255 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
   1256 * accordingly.
   1257 * This is a simpler version of the UTF-32 converter, with
   1258 * fewer states for shorter BOMs.
   1259 *
   1260 * State values:
   1261 * 0    initial state
   1262 * 1    saw first byte
   1263 * 2..5 -
   1264 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
   1265 * 8    UTF-16BE mode
   1266 * 9    UTF-16LE mode
   1267 *
   1268 * During detection: state==number of initial bytes seen so far.
   1269 *
   1270 * On output, emit U+FEFF as the first code point.
   1271 *
   1272 * Variants:
   1273 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
   1274 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
   1275 *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
   1276 */
   1277 U_CDECL_BEGIN
   1278 static void  U_CALLCONV
   1279 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
   1280    if(choice<=UCNV_RESET_TO_UNICODE) {
   1281        /* reset toUnicode: state=0 */
   1282        cnv->mode=0;
   1283    }
   1284    if(choice!=UCNV_RESET_TO_UNICODE) {
   1285        /* reset fromUnicode: prepare to output the UTF-16PE BOM */
   1286        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   1287    }
   1288 }
   1289 U_CDECL_END
   1290 extern const UConverterSharedData _UTF16v2Data;
   1291 U_CDECL_BEGIN
   1292 static void U_CALLCONV
   1293 _UTF16Open(UConverter *cnv,
   1294           UConverterLoadArgs *pArgs,
   1295           UErrorCode *pErrorCode) {
   1296    if(UCNV_GET_VERSION(cnv)<=2) {
   1297        if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
   1298            /*
   1299             * Switch implementation, and switch the staticData that's different
   1300             * and was copied into the UConverter.
   1301             * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
   1302             * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
   1303             */
   1304            cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
   1305            uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
   1306        }
   1307        _UTF16Reset(cnv, UCNV_RESET_BOTH);
   1308    } else {
   1309        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1310    }
   1311 }
   1312 
   1313 static const char *  U_CALLCONV
   1314 _UTF16GetName(const UConverter *cnv) {
   1315    if(UCNV_GET_VERSION(cnv)==0) {
   1316        return "UTF-16";
   1317    } else if(UCNV_GET_VERSION(cnv)==1) {
   1318        return "UTF-16,version=1";
   1319    } else {
   1320        return "UTF-16,version=2";
   1321    }
   1322 }
   1323 U_CDECL_END
   1324 extern const UConverterSharedData _UTF16Data;
   1325 
   1326 static inline bool IS_UTF16BE(const UConverter *cnv) {
   1327    return ((cnv)->sharedData == &_UTF16BEData);
   1328 }
   1329 
   1330 static inline bool IS_UTF16LE(const UConverter *cnv) {
   1331    return ((cnv)->sharedData == &_UTF16LEData);
   1332 }
   1333 
   1334 static inline bool IS_UTF16(const UConverter *cnv) {
   1335    return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data);
   1336 }
   1337 
   1338 U_CDECL_BEGIN
/*
 * toUnicode entry point that handles the BOM at the start of the stream and
 * then delegates to the UTF-16BE or UTF-16LE conversion function.
 *
 * The detection state is loaded from cnv->mode and stored back on return:
 * 0 = nothing seen, 1 = first byte saved in toUBytes[0], 8 = UTF-16BE,
 * 9 = UTF-16LE (6 and 7 exist only transiently inside case 1).
 *
 * pArgs       in/out conversion arguments; source is advanced here, target and
 *             offsets are advanced by the delegated functions.
 * pErrorCode  set to U_ILLEGAL_ESCAPE_SEQUENCE for a missing or reverse BOM
 *             where the converter variant forbids it; otherwise whatever the
 *             delegated function reports.
 */
static void U_CALLCONV
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
    UConverter *cnv=pArgs->converter;
    const char *source=pArgs->source;
    const char *sourceLimit=pArgs->sourceLimit;
    /* offsets at entry; pArgs->offsets moves on as the delegated functions write */
    int32_t *offsets=pArgs->offsets;

    int32_t state, offsetDelta;
    uint8_t b;

    state=cnv->mode;

    /*
     * If we detect a BOM in this buffer, then we must add the BOM size to the
     * offsets because the actual converter function will not see and count the BOM.
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
     */
    offsetDelta=0;

    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
        switch(state) {
        case 0:
            /* remember the first byte; the decision is made when the second one arrives */
            cnv->toUBytes[0]=(uint8_t)*source++;
            cnv->toULength=1;
            state=1;
            break;
        case 1:
            /*
             * Only inside this switch case can the state variable
             * temporarily take two additional values:
             * 6: BOM error, continue with BE
             * 7: BOM error, continue with LE
             */
            b=*source; /* peek at the second byte without consuming it */
            if(cnv->toUBytes[0]==0xfe && b==0xff) {
                if(IS_UTF16LE(cnv)) {
                    state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
                } else {
                    state=8; /* detect UTF-16BE */
                }
            } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
                if(IS_UTF16BE(cnv)) {
                    state=6; /* illegal reverse BOM for Java "UnicodeBig" */
                } else {
                    state=9; /* detect UTF-16LE */
                }
            } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
                state=6; /* illegal missing BOM for Java "Unicode" */
            }
            if(state>=8) {
                /* BOM detected, consume it */
                ++source;
                cnv->toULength=0;
                offsetDelta=(int32_t)(source-pArgs->source);
            } else if(state<6) {
                /* ok: no BOM, and not a reverse BOM */
                if(source!=pArgs->source) {
                    /* reset the source for a correct first offset */
                    source=pArgs->source;
                    cnv->toULength=0;
                }
                /*
                 * else: the first byte came from an earlier buffer and stays in
                 * toUBytes[] for the delegated function to complete
                 */
                if(IS_UTF16LE(cnv)) {
                    /* Make Java "UnicodeLittle" default to LE. */
                    state=9;
                } else {
                    /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
                    state=8;
                }
            } else {
                /*
                 * error: missing BOM, or reverse BOM
                 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
                 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
                 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
                 */
                /* report the non-BOM or reverse BOM as an illegal sequence */
                cnv->toUBytes[1]=b;
                cnv->toULength=2;
                pArgs->source=source+1;
                /* continue with conversion if the callback resets the error */
                /*
                 * Make Java "Unicode" default to BE like standard UTF-16.
                 * Make Java "UnicodeBig" and "UnicodeLittle" default
                 * to their normal endiannesses.
                 */
                cnv->mode=state+2; /* 6 -> 8 (BE), 7 -> 9 (LE) */
                *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
                return;
            }
            /* convert the rest of the stream */
            /* stored now because the delegated functions forward back here while mode<8 */
            cnv->mode=state;
            continue;
        case 8:
            /* call UTF-16BE */
            pArgs->source=source;
            _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
            source=pArgs->source;
            break;
        case 9:
            /* call UTF-16LE */
            pArgs->source=source;
            _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
            source=pArgs->source;
            break;
        default:
            break; /* does not occur */
        }
    }

    /* add BOM size to offsets - see comment at offsetDelta declaration */
    if(offsets!=nullptr && offsetDelta!=0) {
        /* adjust every offset written during this call: [entry offsets, pArgs->offsets) */
        int32_t *offsetsLimit=pArgs->offsets;
        while(offsets<offsetsLimit) {
            *offsets++ += offsetDelta;
        }
    }

    pArgs->source=source;

    if(source==sourceLimit && pArgs->flush) {
        /* handle truncated input */
        switch(state) {
        case 0:
            break; /* no input at all, nothing to do */
        case 8:
            _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
            break;
        case 9:
            _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
            break;
        default:
            /* 0<state<8: framework will report truncation, nothing to do here */
            break;
        }
    }

    /* persist the detection state for the next call */
    cnv->mode=state;
}
   1478 
   1479 static UChar32 U_CALLCONV
   1480 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
   1481                   UErrorCode *pErrorCode) {
   1482    switch(pArgs->converter->mode) {
   1483    case 8:
   1484        return _UTF16BEGetNextUChar(pArgs, pErrorCode);
   1485    case 9:
   1486        return _UTF16LEGetNextUChar(pArgs, pErrorCode);
   1487    default:
   1488        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   1489    }
   1490 }
   1491 U_CDECL_END
   1492 
/*
 * Implementation table for the BOM-detecting UTF-16 converter (versions 0 and 1).
 * fromUnicode uses the platform-endian function (_UTF16PE... is defined as the
 * BE or LE function depending on U_IS_BIG_ENDIAN).
 * NOTE(review): slot order is positional - confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _UTF16Impl = {
    UCNV_UTF16,

    nullptr,
    nullptr,

    _UTF16Open,
    nullptr,
    _UTF16Reset,

    _UTF16ToUnicodeWithOffsets,
    _UTF16ToUnicodeWithOffsets,
    _UTF16PEFromUnicodeWithOffsets,
    _UTF16PEFromUnicodeWithOffsets,
    _UTF16GetNextUChar,

    nullptr, /* ### TODO implement getStarters for all Unicode encodings?! */
    _UTF16GetName,
    nullptr,
    nullptr,
    ucnv_getNonSurrogateUnicodeSet,

    nullptr,
    nullptr
};
   1518 
/*
 * Static description of the BOM-detecting UTF-16 converter.
 * The 2-byte sequence selected by U_IS_BIG_ENDIAN is U+FFFD in platform byte
 * order (presumably the substitution character - confirm against the
 * UConverterStaticData declaration).
 */
static const UConverterStaticData _UTF16StaticData = {
    sizeof(UConverterStaticData),
    "UTF-16",
    1204, /* CCSID for BOM sensitive UTF-16 */
    UCNV_IBM, UCNV_UTF16, 2, 2,
#if U_IS_BIG_ENDIAN
    { 0xff, 0xfd, 0, 0 }, 2,
#else
    { 0xfd, 0xff, 0, 0 }, 2,
#endif
    false, false,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
   1534 
/* Shared data object for UTF-16 (versions 0 and 1): ties the static data to the implementation table. */
const UConverterSharedData _UTF16Data =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
   1537 
/*
 * Implementation table for UTF-16,version=2. Identical to _UTF16Impl except
 * that fromUnicode always uses the big-endian function.
 * NOTE(review): slot order is positional - confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _UTF16v2Impl = {
    UCNV_UTF16,

    nullptr,
    nullptr,

    _UTF16Open,
    nullptr,
    _UTF16Reset,

    _UTF16ToUnicodeWithOffsets,
    _UTF16ToUnicodeWithOffsets,
    _UTF16BEFromUnicodeWithOffsets,
    _UTF16BEFromUnicodeWithOffsets,
    _UTF16GetNextUChar,

    nullptr, /* ### TODO implement getStarters for all Unicode encodings?! */
    _UTF16GetName,
    nullptr,
    nullptr,
    ucnv_getNonSurrogateUnicodeSet,

    nullptr,
    nullptr
};
   1563 
/*
 * Static description of UTF-16,version=2. The 2-byte sequence 0xff 0xfd is
 * U+FFFD in big-endian byte order regardless of platform; _UTF16Open() copies
 * this subChar into the converter when it switches to version 2.
 */
static const UConverterStaticData _UTF16v2StaticData = {
    sizeof(UConverterStaticData),
    "UTF-16,version=2",
    1204, /* CCSID for BOM sensitive UTF-16 */
    UCNV_IBM, UCNV_UTF16, 2, 2,
    { 0xff, 0xfd, 0, 0 }, 2,
    false, false,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
   1575 
/* Shared data object for UTF-16,version=2; _UTF16Open() points the converter at it. */
const UConverterSharedData _UTF16v2Data =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
   1578 
   1579 #endif
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE