tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

unistr_cnv.cpp (12896B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  unistr_cnv.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:2
     14 *
     15 *   created on: 2004aug19
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Character conversion functions moved here from unistr.cpp
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 
     23 #if !UCONFIG_NO_CONVERSION
     24 
     25 #include "unicode/putil.h"
     26 #include "cstring.h"
     27 #include "cmemory.h"
     28 #include "unicode/ustring.h"
     29 #include "unicode/unistr.h"
     30 #include "unicode/ucnv.h"
     31 #include "ucnv_imp.h"
     32 #include "putilimp.h"
     33 #include "ustr_cnv.h"
     34 #include "ustr_imp.h"
     35 
     36 U_NAMESPACE_BEGIN
     37 
     38 //========================================
     39 // Constructors
     40 //========================================
     41 
     42 #if !U_CHARSET_IS_UTF8
     43 
     44 UnicodeString::UnicodeString(const char *codepageData) {
     45    fUnion.fFields.fLengthAndFlags = kShortString;
     46    if(codepageData != 0) {
     47        doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
     48    }
     49 }
     50 
     51 UnicodeString::UnicodeString(const char *codepageData,
     52                             int32_t dataLength) {
     53    fUnion.fFields.fLengthAndFlags = kShortString;
     54    if(codepageData != 0) {
     55        doCodepageCreate(codepageData, dataLength, 0);
     56    }
     57 }
     58 
     59 // else see unistr.cpp
     60 #endif
     61 
     62 UnicodeString::UnicodeString(const char *codepageData,
     63                             const char *codepage) {
     64    fUnion.fFields.fLengthAndFlags = kShortString;
     65    if (codepageData != nullptr) {
     66        doCodepageCreate(codepageData, static_cast<int32_t>(uprv_strlen(codepageData)), codepage);
     67    }
     68 }
     69 
     70 UnicodeString::UnicodeString(const char *codepageData,
     71                             int32_t dataLength,
     72                             const char *codepage) {
     73    fUnion.fFields.fLengthAndFlags = kShortString;
     74    if (codepageData != nullptr) {
     75        doCodepageCreate(codepageData, dataLength, codepage);
     76    }
     77 }
     78 
     79 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
     80                             UConverter *cnv,
     81                             UErrorCode &errorCode) {
     82    fUnion.fFields.fLengthAndFlags = kShortString;
     83    if(U_SUCCESS(errorCode)) {
     84        // check arguments
     85        if(src==nullptr) {
     86            // treat as an empty string, do nothing more
     87        } else if(srcLength<-1) {
     88            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     89        } else {
     90            // get input length
     91            if(srcLength==-1) {
     92                srcLength = static_cast<int32_t>(uprv_strlen(src));
     93            }
     94            if(srcLength>0) {
     95                if (cnv != nullptr) {
     96                    // use the provided converter
     97                    ucnv_resetToUnicode(cnv);
     98                    doCodepageCreate(src, srcLength, cnv, errorCode);
     99                } else {
    100                    // use the default converter
    101                    cnv=u_getDefaultConverter(&errorCode);
    102                    doCodepageCreate(src, srcLength, cnv, errorCode);
    103                    u_releaseDefaultConverter(cnv);
    104                }
    105            }
    106        }
    107 
    108        if(U_FAILURE(errorCode)) {
    109            setToBogus();
    110        }
    111    }
    112 }
    113 
    114 //========================================
    115 // Codeset conversion
    116 //========================================
    117 
    118 #if !U_CHARSET_IS_UTF8
    119 
    120 int32_t
    121 UnicodeString::extract(int32_t start,
    122                       int32_t length,
    123                       char *target,
    124                       uint32_t dstSize) const {
    125    return extract(start, length, target, dstSize, 0);
    126 }
    127 
    128 // else see unistr.cpp
    129 #endif
    130 
    131 int32_t
    132 UnicodeString::extract(int32_t start,
    133                       int32_t length,
    134                       char *target,
    135                       uint32_t dstSize,
    136                       const char *codepage) const
    137 {
    138    // if the arguments are illegal, then do nothing
    139    if (/*dstSize < 0 || */(dstSize > 0 && target == nullptr)) {
    140        return 0;
    141    }
    142 
    143    // pin the indices to legal values
    144    pinIndices(start, length);
    145 
    146    // We need to cast dstSize to int32_t for all subsequent code.
    147    // I don't know why the API was defined with uint32_t but we are stuck with it.
    148    // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
    149    // as a limit in some functions, it may wrap around and yield a pointer
    150    // that compares less-than target.
    151    int32_t capacity;
    152    if(dstSize < 0x7fffffff) {
    153        // Assume that the capacity is real and a limit pointer won't wrap around.
    154        capacity = static_cast<int32_t>(dstSize);
    155    } else {
    156        // Pin the capacity so that a limit pointer does not wrap around.
    157        char* targetLimit = static_cast<char*>(U_MAX_PTR(target));
    158        // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
    159        // greater than target and does not wrap around the top of the address space.
    160        capacity = static_cast<int32_t>(targetLimit - target);
    161    }
    162 
    163    // create the converter
    164    UConverter *converter;
    165    UErrorCode status = U_ZERO_ERROR;
    166 
    167    // just write the NUL if the string length is 0
    168    if(length == 0) {
    169        return u_terminateChars(target, capacity, 0, &status);
    170    }
    171 
    172    // if the codepage is the default, use our cache
    173    // if it is an empty string, then use the "invariant character" conversion
    174    if (codepage == nullptr) {
    175        const char *defaultName = ucnv_getDefaultName();
    176        if(UCNV_FAST_IS_UTF8(defaultName)) {
    177            return toUTF8(start, length, target, capacity);
    178        }
    179        converter = u_getDefaultConverter(&status);
    180    } else if (*codepage == 0) {
    181        // use the "invariant characters" conversion
    182        int32_t destLength;
    183        if(length <= capacity) {
    184            destLength = length;
    185        } else {
    186            destLength = capacity;
    187        }
    188        u_UCharsToChars(getArrayStart() + start, target, destLength);
    189        return u_terminateChars(target, capacity, length, &status);
    190    } else {
    191        converter = ucnv_open(codepage, &status);
    192    }
    193 
    194    length = doExtract(start, length, target, capacity, converter, status);
    195 
    196    // close the converter
    197    if (codepage == nullptr) {
    198        u_releaseDefaultConverter(converter);
    199    } else {
    200        ucnv_close(converter);
    201    }
    202 
    203    return length;
    204 }
    205 
    206 int32_t
    207 UnicodeString::extract(char *dest, int32_t destCapacity,
    208                       UConverter *cnv,
    209                       UErrorCode &errorCode) const
    210 {
    211    if(U_FAILURE(errorCode)) {
    212        return 0;
    213    }
    214 
    215    if (isBogus() || destCapacity < 0 || (destCapacity > 0 && dest == nullptr)) {
    216        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    217        return 0;
    218    }
    219 
    220    // nothing to do?
    221    if(isEmpty()) {
    222        return u_terminateChars(dest, destCapacity, 0, &errorCode);
    223    }
    224 
    225    // get the converter
    226    UBool isDefaultConverter;
    227    if (cnv == nullptr) {
    228        isDefaultConverter=true;
    229        cnv=u_getDefaultConverter(&errorCode);
    230        if(U_FAILURE(errorCode)) {
    231            return 0;
    232        }
    233    } else {
    234        isDefaultConverter=false;
    235        ucnv_resetFromUnicode(cnv);
    236    }
    237 
    238    // convert
    239    int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
    240 
    241    // release the converter
    242    if(isDefaultConverter) {
    243        u_releaseDefaultConverter(cnv);
    244    }
    245 
    246    return len;
    247 }
    248 
    249 int32_t
    250 UnicodeString::doExtract(int32_t start, int32_t length,
    251                         char *dest, int32_t destCapacity,
    252                         UConverter *cnv,
    253                         UErrorCode &errorCode) const
    254 {
    255    if(U_FAILURE(errorCode)) {
    256        if(destCapacity!=0) {
    257            *dest=0;
    258        }
    259        return 0;
    260    }
    261 
    262    const char16_t *src=getArrayStart()+start, *srcLimit=src+length;
    263    char *originalDest=dest;
    264    const char *destLimit;
    265 
    266    if(destCapacity==0) {
    267        destLimit=dest=nullptr;
    268    } else if(destCapacity==-1) {
    269        // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
    270        destLimit = static_cast<char*>(U_MAX_PTR(dest));
    271        // for NUL-termination, translate into highest int32_t
    272        destCapacity=0x7fffffff;
    273    } else {
    274        destLimit=dest+destCapacity;
    275    }
    276 
    277    // perform the conversion
    278    UErrorCode bufferStatus = U_ZERO_ERROR;
    279    ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, nullptr, true, &bufferStatus);
    280    length = static_cast<int32_t>(dest - originalDest);
    281 
    282    // if an overflow occurs, then get the preflighting length
    283    if(bufferStatus==U_BUFFER_OVERFLOW_ERROR) {
    284        char buffer[1024];
    285 
    286        destLimit=buffer+sizeof(buffer);
    287        do {
    288            dest=buffer;
    289            bufferStatus=U_ZERO_ERROR;
    290            ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, nullptr, true, &bufferStatus);
    291            length += static_cast<int32_t>(dest - buffer);
    292        } while(bufferStatus==U_BUFFER_OVERFLOW_ERROR);
    293    }
    294    if (U_FAILURE(bufferStatus)) {
    295        errorCode = bufferStatus;
    296    }
    297 
    298    return u_terminateChars(originalDest, destCapacity, length, &errorCode);
    299 }
    300 
    301 void
    302 UnicodeString::doCodepageCreate(const char *codepageData,
    303                                int32_t dataLength,
    304                                const char *codepage)
    305 {
    306    // if there's nothing to convert, do nothing
    307    if (codepageData == nullptr || dataLength == 0 || dataLength < -1) {
    308        return;
    309    }
    310    if(dataLength == -1) {
    311        dataLength = static_cast<int32_t>(uprv_strlen(codepageData));
    312    }
    313 
    314    UErrorCode status = U_ZERO_ERROR;
    315 
    316    // create the converter
    317    // if the codepage is the default, use our cache
    318    // if it is an empty string, then use the "invariant character" conversion
    319    UConverter *converter;
    320    if (codepage == nullptr) {
    321        const char *defaultName = ucnv_getDefaultName();
    322        if(UCNV_FAST_IS_UTF8(defaultName)) {
    323            setToUTF8(StringPiece(codepageData, dataLength));
    324            return;
    325        }
    326        converter = u_getDefaultConverter(&status);
    327    } else if (*codepage == 0) {
    328        // use the "invariant characters" conversion
    329        if(cloneArrayIfNeeded(dataLength, dataLength, false)) {
    330            u_charsToUChars(codepageData, getArrayStart(), dataLength);
    331            setLength(dataLength);
    332        } else {
    333            setToBogus();
    334        }
    335        return;
    336    } else {
    337        converter = ucnv_open(codepage, &status);
    338    }
    339 
    340    // if we failed, set the appropriate flags and return
    341    if(U_FAILURE(status)) {
    342        setToBogus();
    343        return;
    344    }
    345 
    346    // perform the conversion
    347    doCodepageCreate(codepageData, dataLength, converter, status);
    348    if(U_FAILURE(status)) {
    349        setToBogus();
    350    }
    351 
    352    // close the converter
    353    if (codepage == nullptr) {
    354        u_releaseDefaultConverter(converter);
    355    } else {
    356        ucnv_close(converter);
    357    }
    358 }
    359 
    360 void
    361 UnicodeString::doCodepageCreate(const char *codepageData,
    362                                int32_t dataLength,
    363                                UConverter *converter,
    364                                UErrorCode &status)
    365 {
    366    if(U_FAILURE(status)) {
    367        return;
    368    }
    369 
    370    // set up the conversion parameters
    371    const char *mySource     = codepageData;
    372    const char *mySourceEnd  = mySource + dataLength;
    373    char16_t *array, *myTarget;
    374 
    375    // estimate the size needed:
    376    int32_t arraySize;
    377    if(dataLength <= US_STACKBUF_SIZE) {
    378        // try to use the stack buffer
    379        arraySize = US_STACKBUF_SIZE;
    380    } else {
    381        // 1.25 char16_t's per source byte should cover most cases
    382        arraySize = dataLength + (dataLength >> 2);
    383    }
    384 
    385    // we do not care about the current contents
    386    UBool doCopyArray = false;
    387    for(;;) {
    388        if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
    389            setToBogus();
    390            break;
    391        }
    392 
    393        // perform the conversion
    394        array = getArrayStart();
    395        myTarget = array + length();
    396        UErrorCode bufferStatus = U_ZERO_ERROR;
    397        ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
    398            &mySource, mySourceEnd, nullptr, true, &bufferStatus);
    399 
    400        // update the conversion parameters
    401        setLength(static_cast<int32_t>(myTarget - array));
    402 
    403        // allocate more space and copy data, if needed
    404        if(bufferStatus == U_BUFFER_OVERFLOW_ERROR) {
    405            // keep the previous conversion results
    406            doCopyArray = true;
    407 
    408            // estimate the new size needed, larger than before
    409            // try 2 char16_t's per remaining source byte
    410            arraySize = static_cast<int32_t>(length() + 2 * (mySourceEnd - mySource));
    411        } else {
    412            if (U_FAILURE(bufferStatus)) {
    413                status = bufferStatus;
    414            }
    415            break;
    416        }
    417    }
    418 }
    419 
    420 U_NAMESPACE_END
    421 
    422 #endif