tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucnv_ext.cpp (40267B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2003-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  ucnv_ext.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2003jun13
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Conversion extensions
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 
     23 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
     24 
     25 #include "unicode/uset.h"
     26 #include "unicode/ustring.h"
     27 #include "ucnv_bld.h"
     28 #include "ucnv_cnv.h"
     29 #include "ucnv_ext.h"
     30 #include "cmemory.h"
     31 #include "uassert.h"
     32 
     33 /* to Unicode --------------------------------------------------------------- */
     34 
     35 /*
     36 * @return lookup value for the byte, if found; else 0
     37 */
     38 static inline uint32_t
     39 ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
     40    uint32_t word0, word;
     41    int32_t i, start, limit;
     42 
     43    /* check the input byte against the lowest and highest section bytes */
     44    start = static_cast<int32_t>(UCNV_EXT_TO_U_GET_BYTE(toUSection[0]));
     45    limit = static_cast<int32_t>(UCNV_EXT_TO_U_GET_BYTE(toUSection[length - 1]));
     46    if(byte<start || limit<byte) {
     47        return 0; /* the byte is out of range */
     48    }
     49 
     50    if(length==((limit-start)+1)) {
     51        /* direct access on a linear array */
     52        return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */
     53    }
     54 
     55    /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */
     56    word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0);
     57 
     58    /*
     59     * Shift byte once instead of each section word and add 0xffffff.
     60     * We will compare the shifted/added byte (bbffffff) against
     61     * section words which have byte values in the same bit position.
     62     * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv
     63     * for all v=0..f
     64     * so we need not mask off the lower 24 bits of each section word.
     65     */
     66    word=word0|UCNV_EXT_TO_U_VALUE_MASK;
     67 
     68    /* binary search */
     69    start=0;
     70    limit=length;
     71    for(;;) {
     72        i=limit-start;
     73        if(i<=1) {
     74            break; /* done */
     75        }
     76        /* start<limit-1 */
     77 
     78        if(i<=4) {
     79            /* linear search for the last part */
     80            if(word0<=toUSection[start]) {
     81                break;
     82            }
     83            if(++start<limit && word0<=toUSection[start]) {
     84                break;
     85            }
     86            if(++start<limit && word0<=toUSection[start]) {
     87                break;
     88            }
     89            /* always break at start==limit-1 */
     90            ++start;
     91            break;
     92        }
     93 
     94        i=(start+limit)/2;
     95        if(word<toUSection[i]) {
     96            limit=i;
     97        } else {
     98            start=i;
     99        }
    100    }
    101 
    102    /* did we really find it? */
    103    if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) {
    104        return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */
    105    } else {
    106        return 0; /* not found */
    107    }
    108 }
    109 
    110 /*
    111 * true if not an SI/SO stateful converter,
    112 * or if the match length fits with the current converter state
    113 */
    114 #define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \
    115    ((sisoState)<0 || ((sisoState)==0) == (match==1))
    116 
    117 /*
    118 * this works like ucnv_extMatchFromU() except
    119 * - the first character is in pre
    120 * - no trie is used
    121 * - the returned matchLength is not offset by 2
    122 */
    123 static int32_t
    124 ucnv_extMatchToU(const int32_t *cx, int8_t sisoState,
    125                 const char *pre, int32_t preLength,
    126                 const char *src, int32_t srcLength,
    127                 uint32_t *pMatchValue,
    128                 UBool /*useFallback*/, UBool flush) {
    129    const uint32_t *toUTable, *toUSection;
    130 
    131    uint32_t value, matchValue;
    132    int32_t i, j, idx, length, matchLength;
    133    uint8_t b;
    134 
    135    if(cx==nullptr || cx[UCNV_EXT_TO_U_LENGTH]<=0) {
    136        return 0; /* no extension data, no match */
    137    }
    138 
    139    /* initialize */
    140    toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t);
    141    idx=0;
    142 
    143    matchValue=0;
    144    i=j=matchLength=0;
    145 
    146    if(sisoState==0) {
    147        /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
    148        if(preLength>1) {
    149            return 0; /* no match of a DBCS sequence in SBCS mode */
    150        } else if(preLength==1) {
    151            srcLength=0;
    152        } else /* preLength==0 */ {
    153            if(srcLength>1) {
    154                srcLength=1;
    155            }
    156        }
    157        flush=true;
    158    }
    159 
    160    /* we must not remember fallback matches when not using fallbacks */
    161 
    162    /* match input units until there is a full match or the input is consumed */
    163    for(;;) {
    164        /* go to the next section */
    165        toUSection=toUTable+idx;
    166 
    167        /* read first pair of the section */
    168        value=*toUSection++;
    169        length=UCNV_EXT_TO_U_GET_BYTE(value);
    170        value=UCNV_EXT_TO_U_GET_VALUE(value);
    171        if( value!=0 &&
    172            (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
    173             TO_U_USE_FALLBACK(useFallback)) &&
    174            UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
    175        ) {
    176            /* remember longest match so far */
    177            matchValue=value;
    178            matchLength=i+j;
    179        }
    180 
    181        /* match pre[] then src[] */
    182        if(i<preLength) {
    183            b = static_cast<uint8_t>(pre[i++]);
    184        } else if(j<srcLength) {
    185            b = static_cast<uint8_t>(src[j++]);
    186        } else {
    187            /* all input consumed, partial match */
    188            if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) {
    189                /*
    190                 * end of the entire input stream, stop with the longest match so far
    191                 * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
    192                 * because it must fit into state buffers
    193                 */
    194                break;
    195            } else {
    196                /* continue with more input next time */
    197                return -length;
    198            }
    199        }
    200 
    201        /* search for the current char16_t */
    202        value=ucnv_extFindToU(toUSection, length, b);
    203        if(value==0) {
    204            /* no match here, stop with the longest match so far */
    205            break;
    206        } else {
    207            if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
    208                /* partial match, continue */
    209                idx = static_cast<int32_t>(UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value));
    210            } else {
    211                if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
    212                     TO_U_USE_FALLBACK(useFallback)) &&
    213                    UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
    214                ) {
    215                    /* full match, stop with result */
    216                    matchValue=value;
    217                    matchLength=i+j;
    218                } else {
    219                    /* full match on fallback not taken, stop with the longest match so far */
    220                }
    221                break;
    222            }
    223        }
    224    }
    225 
    226    if(matchLength==0) {
    227        /* no match at all */
    228        return 0;
    229    }
    230 
    231    /* return result */
    232    *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
    233    return matchLength;
    234 }
    235 
    236 static inline void
    237 ucnv_extWriteToU(UConverter *cnv, const int32_t *cx,
    238                 uint32_t value,
    239                 char16_t **target, const char16_t *targetLimit,
    240                 int32_t **offsets, int32_t srcIndex,
    241                 UErrorCode *pErrorCode) {
    242    /* output the result */
    243    if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
    244        /* output a single code point */
    245        ucnv_toUWriteCodePoint(
    246            cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value),
    247            target, targetLimit,
    248            offsets, srcIndex,
    249            pErrorCode);
    250    } else {
    251        /* output a string - with correct data we have resultLength>0 */
    252        ucnv_toUWriteUChars(
    253            cnv,
    254            UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, char16_t)+
    255                UCNV_EXT_TO_U_GET_INDEX(value),
    256            UCNV_EXT_TO_U_GET_LENGTH(value),
    257            target, targetLimit,
    258            offsets, srcIndex,
    259            pErrorCode);
    260    }
    261 }
    262 
    263 /*
    264 * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS),
    265 * or 1 for DBCS-only,
    266 * or -1 if the converter is not SI/SO stateful
    267 *
    268 * Note: For SI/SO stateful converters getting here,
    269 * cnv->mode==0 is equivalent to firstLength==1.
    270 */
    271 #define UCNV_SISO_STATE(cnv) \
    272    ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? (int8_t)(cnv)->mode : \
    273     (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1)
    274 
    275 /*
    276 * target<targetLimit; set error code for overflow
    277 */
    278 U_CFUNC UBool
    279 ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
    280                        int32_t firstLength,
    281                        const char **src, const char *srcLimit,
    282                        char16_t **target, const char16_t *targetLimit,
    283                        int32_t **offsets, int32_t srcIndex,
    284                        UBool flush,
    285                        UErrorCode *pErrorCode) {
    286    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
    287    int32_t match;
    288 
    289    /* try to match */
    290    match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv),
    291                           (const char *)cnv->toUBytes, firstLength,
    292                           *src, (int32_t)(srcLimit-*src),
    293                           &value,
    294                           cnv->useFallback, flush);
    295    if(match>0) {
    296        /* advance src pointer for the consumed input */
    297        *src+=match-firstLength;
    298 
    299        /* write result to target */
    300        ucnv_extWriteToU(cnv, cx,
    301                         value,
    302                         target, targetLimit,
    303                         offsets, srcIndex,
    304                         pErrorCode);
    305        return true;
    306    } else if(match<0) {
    307        /* save state for partial match */
    308        const char *s;
    309        int32_t j;
    310 
    311        /* copy the first code point */
    312        s=(const char *)cnv->toUBytes;
    313        cnv->preToUFirstLength=(int8_t)firstLength;
    314        for(j=0; j<firstLength; ++j) {
    315            cnv->preToU[j]=*s++;
    316        }
    317 
    318        /* now copy the newly consumed input */
    319        s=*src;
    320        match=-match;
    321        for(; j<match; ++j) {
    322            cnv->preToU[j]=*s++;
    323        }
    324        *src=s; /* same as *src=srcLimit; because we reached the end of input */
    325        cnv->preToULength=(int8_t)match;
    326        return true;
    327    } else /* match==0 no match */ {
    328        return false;
    329    }
    330 }
    331 
    332 U_CFUNC UChar32
    333 ucnv_extSimpleMatchToU(const int32_t *cx,
    334                       const char *source, int32_t length,
    335                       UBool useFallback) {
    336    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
    337    int32_t match;
    338 
    339    if(length<=0) {
    340        return 0xffff;
    341    }
    342 
    343    /* try to match */
    344    match=ucnv_extMatchToU(cx, -1,
    345                           source, length,
    346                           nullptr, 0,
    347                           &value,
    348                           useFallback, true);
    349    if(match==length) {
    350        /* write result for simple, single-character conversion */
    351        if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
    352            return UCNV_EXT_TO_U_GET_CODE_POINT(value);
    353        }
    354    }
    355 
    356    /*
    357     * return no match because
    358     * - match>0 && value points to string: simple conversion cannot handle multiple code points
    359     * - match>0 && match!=length: not all input consumed, forbidden for this function
    360     * - match==0: no match found in the first place
    361     * - match<0: partial match, not supported for simple conversion (and flush==true)
    362     */
    363    return 0xfffe;
    364 }
    365 
    366 /*
    367 * continue partial match with new input
    368 * never called for simple, single-character conversion
    369 */
    370 U_CFUNC void
    371 ucnv_extContinueMatchToU(UConverter *cnv,
    372                         UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
    373                         UErrorCode *pErrorCode) {
    374    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
    375    int32_t match, length;
    376 
    377    match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv),
    378                           cnv->preToU, cnv->preToULength,
    379                           pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
    380                           &value,
    381                           cnv->useFallback, pArgs->flush);
    382    if(match>0) {
    383        if(match>=cnv->preToULength) {
    384            /* advance src pointer for the consumed input */
    385            pArgs->source+=match-cnv->preToULength;
    386            cnv->preToULength=0;
    387        } else {
    388            /* the match did not use all of preToU[] - keep the rest for replay */
    389            length=cnv->preToULength-match;
    390            uprv_memmove(cnv->preToU, cnv->preToU+match, length);
    391            cnv->preToULength=(int8_t)-length;
    392        }
    393 
    394        /* write result */
    395        ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes,
    396                         value,
    397                         &pArgs->target, pArgs->targetLimit,
    398                         &pArgs->offsets, srcIndex,
    399                         pErrorCode);
    400    } else if(match<0) {
    401        /* save state for partial match */
    402        const char *s;
    403        int32_t j;
    404 
    405        /* just _append_ the newly consumed input to preToU[] */
    406        s=pArgs->source;
    407        match=-match;
    408        for(j=cnv->preToULength; j<match; ++j) {
    409            cnv->preToU[j]=*s++;
    410        }
    411        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
    412        cnv->preToULength=(int8_t)match;
    413    } else /* match==0 */ {
    414        /*
    415         * no match
    416         *
    417         * We need to split the previous input into two parts:
    418         *
    419         * 1. The first codepage character is unmappable - that's how we got into
    420         *    trying the extension data in the first place.
    421         *    We need to move it from the preToU buffer
    422         *    to the error buffer, set an error code,
    423         *    and prepare the rest of the previous input for 2.
    424         *
    425         * 2. The rest of the previous input must be converted once we
    426         *    come back from the callback for the first character.
    427         *    At that time, we have to try again from scratch to convert
    428         *    these input characters.
    429         *    The replay will be handled by the ucnv.c conversion code.
    430         */
    431 
    432        /* move the first codepage character to the error field */
    433        uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
    434        cnv->toULength=cnv->preToUFirstLength;
    435 
    436        /* move the rest up inside the buffer */
    437        length=cnv->preToULength-cnv->preToUFirstLength;
    438        if(length>0) {
    439            uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
    440        }
    441 
    442        /* mark preToU for replay */
    443        cnv->preToULength=(int8_t)-length;
    444 
    445        /* set the error code for unassigned */
    446        *pErrorCode=U_INVALID_CHAR_FOUND;
    447    }
    448 }
    449 
    450 /* from Unicode ------------------------------------------------------------- */
    451 
    452 // Use roundtrips, "good one-way" mappings, and some normal fallbacks.
    453 static inline UBool
    454 extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) {
    455    return
    456        ((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 ||
    457            FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
    458        (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0;
    459 }
    460 
    461 /*
    462 * @return index of the char16_t, if found; else <0
    463 */
    464 static inline int32_t
    465 ucnv_extFindFromU(const char16_t *fromUSection, int32_t length, char16_t u) {
    466    int32_t i, start, limit;
    467 
    468    /* binary search */
    469    start=0;
    470    limit=length;
    471    for(;;) {
    472        i=limit-start;
    473        if(i<=1) {
    474            break; /* done */
    475        }
    476        /* start<limit-1 */
    477 
    478        if(i<=4) {
    479            /* linear search for the last part */
    480            if(u<=fromUSection[start]) {
    481                break;
    482            }
    483            if(++start<limit && u<=fromUSection[start]) {
    484                break;
    485            }
    486            if(++start<limit && u<=fromUSection[start]) {
    487                break;
    488            }
    489            /* always break at start==limit-1 */
    490            ++start;
    491            break;
    492        }
    493 
    494        i=(start+limit)/2;
    495        if(u<fromUSection[i]) {
    496            limit=i;
    497        } else {
    498            start=i;
    499        }
    500    }
    501 
    502    /* did we really find it? */
    503    if(start<limit && u==fromUSection[start]) {
    504        return start;
    505    } else {
    506        return -1; /* not found */
    507    }
    508 }
    509 
    510 /*
    511 * @param cx pointer to extension data; if nullptr, returns 0
    512 * @param firstCP the first code point before all the other UChars
    513 * @param pre UChars that must match; !initialMatch: partial match with them
    514 * @param preLength length of pre, >=0
    515 * @param src UChars that can be used to complete a match
    516 * @param srcLength length of src, >=0
    517 * @param pMatchValue [out] output result value for the match from the data structure
    518 * @param useFallback "use fallback" flag, usually from cnv->useFallback
    519 * @param flush true if the end of the input stream is reached
    520 * @return >1: matched, return value=total match length (number of input units matched)
    521 *          1: matched, no mapping but request for <subchar1>
    522 *             (only for the first code point)
    523 *          0: no match
    524 *         <0: partial match, return value=negative total match length
    525 *             (partial matches are never returned for flush==true)
    526 *             (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
    527 *         the matchLength is 2 if only firstCP matched, and >2 if firstCP and
    528 *         further code units matched
    529 */
    530 static int32_t
    531 ucnv_extMatchFromU(const int32_t *cx,
    532                   UChar32 firstCP,
    533                   const char16_t *pre, int32_t preLength,
    534                   const char16_t *src, int32_t srcLength,
    535                   uint32_t *pMatchValue,
    536                   UBool useFallback, UBool flush) {
    537    const uint16_t *stage12, *stage3;
    538    const uint32_t *stage3b;
    539 
    540    const char16_t *fromUTableUChars, *fromUSectionUChars;
    541    const uint32_t *fromUTableValues, *fromUSectionValues;
    542 
    543    uint32_t value, matchValue;
    544    int32_t i, j, idx, length, matchLength;
    545    char16_t c;
    546 
    547    if(cx==nullptr) {
    548        return 0; /* no extension data, no match */
    549    }
    550 
    551    /* trie lookup of firstCP */
    552    idx=firstCP>>10; /* stage 1 index */
    553    if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
    554        return 0; /* the first code point is outside the trie */
    555    }
    556 
    557    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
    558    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
    559    idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP);
    560 
    561    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
    562    value=stage3b[idx];
    563    if(value==0) {
    564        return 0;
    565    }
    566 
    567    /*
    568     * Tests for (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0:
    569     * Do not interpret values with reserved bits used, for forward compatibility,
    570     * and do not even remember intermediate results with reserved bits used.
    571     */
    572 
    573    if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
    574        /* partial match, enter the loop below */
    575        idx = static_cast<int32_t>(UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value));
    576 
    577        /* initialize */
    578        fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, char16_t);
    579        fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);
    580 
    581        matchValue=0;
    582        i=j=matchLength=0;
    583 
    584        /* we must not remember fallback matches when not using fallbacks */
    585 
    586        /* match input units until there is a full match or the input is consumed */
    587        for(;;) {
    588            /* go to the next section */
    589            fromUSectionUChars=fromUTableUChars+idx;
    590            fromUSectionValues=fromUTableValues+idx;
    591 
    592            /* read first pair of the section */
    593            length=*fromUSectionUChars++;
    594            value=*fromUSectionValues++;
    595            if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) {
    596                /* remember longest match so far */
    597                matchValue=value;
    598                matchLength=2+i+j;
    599            }
    600 
    601            /* match pre[] then src[] */
    602            if(i<preLength) {
    603                c=pre[i++];
    604            } else if(j<srcLength) {
    605                c=src[j++];
    606            } else {
    607                /* all input consumed, partial match */
    608                if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) {
    609                    /*
    610                     * end of the entire input stream, stop with the longest match so far
    611                     * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
    612                     * because it must fit into state buffers
    613                     */
    614                    break;
    615                } else {
    616                    /* continue with more input next time */
    617                    return -(2+length);
    618                }
    619            }
    620 
    621            /* search for the current char16_t */
    622            idx=ucnv_extFindFromU(fromUSectionUChars, length, c);
    623            if(idx<0) {
    624                /* no match here, stop with the longest match so far */
    625                break;
    626            } else {
    627                value=fromUSectionValues[idx];
    628                if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
    629                    /* partial match, continue */
    630                    idx = static_cast<int32_t>(UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value));
    631                } else {
    632                    if(extFromUUseMapping(useFallback, value, firstCP)) {
    633                        /* full match, stop with result */
    634                        matchValue=value;
    635                        matchLength=2+i+j;
    636                    } else {
    637                        /* full match on fallback not taken, stop with the longest match so far */
    638                    }
    639                    break;
    640                }
    641            }
    642        }
    643 
    644        if(matchLength==0) {
    645            /* no match at all */
    646            return 0;
    647        }
    648    } else /* result from firstCP trie lookup */ {
    649        if(extFromUUseMapping(useFallback, value, firstCP)) {
    650            /* full match, stop with result */
    651            matchValue=value;
    652            matchLength=2;
    653        } else {
    654            /* fallback not taken */
    655            return 0;
    656        }
    657    }
    658 
    659    /* return result */
    660    if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
    661        return 1; /* assert matchLength==2 */
    662    }
    663 
    664    *pMatchValue=matchValue;
    665    return matchLength;
    666 }
    667 
    668 /*
    669 * @param value fromUnicode mapping table value; ignores roundtrip and reserved bits
    670 */
    671 static inline void
    672 ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx,
    673                   uint32_t value,
    674                   char **target, const char *targetLimit,
    675                   int32_t **offsets, int32_t srcIndex,
    676                   UErrorCode *pErrorCode) {
    677    uint8_t buffer[1+UCNV_EXT_MAX_BYTES];
    678    const uint8_t *result;
    679    int32_t length, prevLength;
    680 
    681    length=UCNV_EXT_FROM_U_GET_LENGTH(value);
    682    value = UCNV_EXT_FROM_U_GET_DATA(value);
    683 
    684    /* output the result */
    685    if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
    686        /*
    687         * Generate a byte array and then write it below.
    688         * This is not the fastest possible way, but it should be ok for
    689         * extension mappings, and it is much simpler.
    690         * Offset and overflow handling are only done once this way.
    691         */
    692        uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */
    693        switch(length) {
    694        case 3:
    695            *p++ = static_cast<uint8_t>(value >> 16);
    696            U_FALLTHROUGH;
    697        case 2:
    698            *p++ = static_cast<uint8_t>(value >> 8);
    699            U_FALLTHROUGH;
    700        case 1:
    701            *p++ = static_cast<uint8_t>(value);
    702            U_FALLTHROUGH;
    703        default:
    704            break; /* will never occur */
    705        }
    706        result=buffer+1;
    707    } else {
    708        result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
    709    }
    710 
    711    /* with correct data we have length>0 */
    712 
    713    if((prevLength=cnv->fromUnicodeStatus)!=0) {
    714        /* handle SI/SO stateful output */
    715        uint8_t shiftByte;
    716 
    717        if(prevLength>1 && length==1) {
    718            /* change from double-byte mode to single-byte */
    719            shiftByte = static_cast<uint8_t>(UCNV_SI);
    720            cnv->fromUnicodeStatus=1;
    721        } else if(prevLength==1 && length>1) {
    722            /* change from single-byte mode to double-byte */
    723            shiftByte = static_cast<uint8_t>(UCNV_SO);
    724            cnv->fromUnicodeStatus=2;
    725        } else {
    726            shiftByte=0;
    727        }
    728 
    729        if(shiftByte!=0) {
    730            /* prepend the shift byte to the result bytes */
    731            buffer[0]=shiftByte;
    732            if(result!=buffer+1) {
    733                uprv_memcpy(buffer+1, result, length);
    734            }
    735            result=buffer;
    736            ++length;
    737        }
    738    }
    739 
    740    ucnv_fromUWriteBytes(cnv, reinterpret_cast<const char*>(result), length,
    741                         target, targetLimit,
    742                         offsets, srcIndex,
    743                         pErrorCode);
    744 }
    745 
    746 /*
    747 * target<targetLimit; set error code for overflow
    748 */
    749 U_CFUNC UBool
    750 ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
    751                          UChar32 cp,
    752                          const char16_t **src, const char16_t *srcLimit,
    753                          char **target, const char *targetLimit,
    754                          int32_t **offsets, int32_t srcIndex,
    755                          UBool flush,
    756                          UErrorCode *pErrorCode) {
    757    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
    758    int32_t match;
    759 
    760    /* try to match */
    761    match=ucnv_extMatchFromU(cx, cp,
    762                             nullptr, 0,
    763                             *src, (int32_t)(srcLimit-*src),
    764                             &value,
    765                             cnv->useFallback, flush);
    766 
    767    /* reject a match if the result is a single byte for DBCS-only */
    768    if( match>=2 &&
    769        !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 &&
    770          cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY)
    771    ) {
    772        /* advance src pointer for the consumed input */
    773        *src+=match-2; /* remove 2 for the initial code point */
    774 
    775        /* write result to target */
    776        ucnv_extWriteFromU(cnv, cx,
    777                           value,
    778                           target, targetLimit,
    779                           offsets, srcIndex,
    780                           pErrorCode);
    781        return true;
    782    } else if(match<0) {
    783        /* save state for partial match */
    784        const char16_t *s;
    785        int32_t j;
    786 
    787        /* copy the first code point */
    788        cnv->preFromUFirstCP=cp;
    789 
    790        /* now copy the newly consumed input */
    791        s=*src;
    792        match=-match-2; /* remove 2 for the initial code point */
    793        for(j=0; j<match; ++j) {
    794            cnv->preFromU[j]=*s++;
    795        }
    796        *src=s; /* same as *src=srcLimit; because we reached the end of input */
    797        cnv->preFromULength=(int8_t)match;
    798        return true;
    799    } else if(match==1) {
    800        /* matched, no mapping but request for <subchar1> */
    801        cnv->useSubChar1=true;
    802        return false;
    803    } else /* match==0 no match */ {
    804        return false;
    805    }
    806 }
    807 
    808 /*
    809 * Used by ISO 2022 implementation.
    810 * @return number of bytes in *pValue; negative number if fallback; 0 for no mapping
    811 */
    812 U_CFUNC int32_t
    813 ucnv_extSimpleMatchFromU(const int32_t *cx,
    814                         UChar32 cp, uint32_t *pValue,
    815                         UBool useFallback) {
    816    uint32_t value;
    817    int32_t match;
    818 
    819    /* try to match */
    820    match=ucnv_extMatchFromU(cx,
    821                             cp,
    822                             nullptr, 0,
    823                             nullptr, 0,
    824                             &value,
    825                             useFallback, true);
    826    if(match>=2) {
    827        /* write result for simple, single-character conversion */
    828        int32_t length;
    829        int isRoundtrip;
    830 
    831        isRoundtrip=UCNV_EXT_FROM_U_IS_ROUNDTRIP(value);
    832        length=UCNV_EXT_FROM_U_GET_LENGTH(value);
    833        value = UCNV_EXT_FROM_U_GET_DATA(value);
    834 
    835        if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
    836            *pValue=value;
    837            return isRoundtrip ? length : -length;
    838 #if 0 /* not currently used */
    839        } else if(length==4) {
    840            /* de-serialize a 4-byte result */
    841            const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
    842            *pValue=
    843                ((uint32_t)result[0]<<24)|
    844                ((uint32_t)result[1]<<16)|
    845                ((uint32_t)result[2]<<8)|
    846                result[3];
    847            return isRoundtrip ? 4 : -4;
    848 #endif
    849        }
    850    }
    851 
    852    /*
    853     * return no match because
    854     * - match>1 && resultLength>4: result too long for simple conversion
    855     * - match==1: no match found, <subchar1> preferred
    856     * - match==0: no match found in the first place
    857     * - match<0: partial match, not supported for simple conversion (and flush==true)
    858     */
    859    return 0;
    860 }
    861 
    862 /*
    863 * continue partial match with new input, requires cnv->preFromUFirstCP>=0
    864 * never called for simple, single-character conversion
    865 */
    866 U_CFUNC void
    867 ucnv_extContinueMatchFromU(UConverter *cnv,
    868                           UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
    869                           UErrorCode *pErrorCode) {
    870    uint32_t value = 0;  /* initialize output-only param to 0 to silence gcc */
    871    int32_t match;
    872 
    873    match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes,
    874                             cnv->preFromUFirstCP,
    875                             cnv->preFromU, cnv->preFromULength,
    876                             pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
    877                             &value,
    878                             cnv->useFallback, pArgs->flush);
    879    if(match>=2) {
    880        match-=2; /* remove 2 for the initial code point */
    881 
    882        if(match>=cnv->preFromULength) {
    883            /* advance src pointer for the consumed input */
    884            pArgs->source+=match-cnv->preFromULength;
    885            cnv->preFromULength=0;
    886        } else {
    887            /* the match did not use all of preFromU[] - keep the rest for replay */
    888            int32_t length=cnv->preFromULength-match;
    889            u_memmove(cnv->preFromU, cnv->preFromU+match, length);
    890            cnv->preFromULength=(int8_t)-length;
    891        }
    892 
    893        /* finish the partial match */
    894        cnv->preFromUFirstCP=U_SENTINEL;
    895 
    896        /* write result */
    897        ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes,
    898                           value,
    899                           &pArgs->target, pArgs->targetLimit,
    900                           &pArgs->offsets, srcIndex,
    901                           pErrorCode);
    902    } else if(match<0) {
    903        /* save state for partial match */
    904        const char16_t *s;
    905        int32_t j;
    906 
    907        /* just _append_ the newly consumed input to preFromU[] */
    908        s=pArgs->source;
    909        match=-match-2; /* remove 2 for the initial code point */
    910        for(j=cnv->preFromULength; j<match; ++j) {
    911            U_ASSERT(j>=0);
    912            cnv->preFromU[j]=*s++;
    913        }
    914        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
    915        cnv->preFromULength=(int8_t)match;
    916    } else /* match==0 or 1 */ {
    917        /*
    918         * no match
    919         *
    920         * We need to split the previous input into two parts:
    921         *
    922         * 1. The first code point is unmappable - that's how we got into
    923         *    trying the extension data in the first place.
    924         *    We need to move it from the preFromU buffer
    925         *    to the error buffer, set an error code,
    926         *    and prepare the rest of the previous input for 2.
    927         *
    928         * 2. The rest of the previous input must be converted once we
    929         *    come back from the callback for the first code point.
    930         *    At that time, we have to try again from scratch to convert
    931         *    these input characters.
    932         *    The replay will be handled by the ucnv.c conversion code.
    933         */
    934 
    935        if(match==1) {
    936            /* matched, no mapping but request for <subchar1> */
    937            cnv->useSubChar1=true;
    938        }
    939 
    940        /* move the first code point to the error field */
    941        cnv->fromUChar32=cnv->preFromUFirstCP;
    942        cnv->preFromUFirstCP=U_SENTINEL;
    943 
    944        /* mark preFromU for replay */
    945        cnv->preFromULength=-cnv->preFromULength;
    946 
    947        /* set the error code for unassigned */
    948        *pErrorCode=U_INVALID_CHAR_FOUND;
    949    }
    950 }
    951 
    952 static UBool
    953 extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) {
    954    if(which==UCNV_ROUNDTRIP_SET) {
    955        // Add only code points for which the roundtrip flag is set.
    956        // Do not add any fallbacks, even if ucnv_fromUnicode() would use them
    957        // (fallbacks from PUA). See the API docs for ucnv_getUnicodeSet().
    958        //
    959        // By analogy, also do not add "good one-way" mappings.
    960        //
    961        // Do not add entries with reserved bits set.
    962        if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))!=
    963                UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) {
    964            return false;
    965        }
    966    } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
    967        // Do not add entries with reserved bits set.
    968        if((value&UCNV_EXT_FROM_U_RESERVED_MASK)!=0) {
    969            return false;
    970        }
    971    }
    972    // Do not add <subchar1> entries or other (future?) pseudo-entries
    973    // with an output length of 0.
    974    return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength;
    975 }
    976 
    977 static void
    978 ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
    979                            const int32_t *cx,
    980                            const USetAdder *sa,
    981                            UConverterUnicodeSet which,
    982                            int32_t minLength,
    983                            UChar32 firstCP,
    984                            char16_t s[UCNV_EXT_MAX_UCHARS], int32_t length,
    985                            int32_t sectionIndex,
    986                            UErrorCode *pErrorCode) {
    987    const char16_t *fromUSectionUChars;
    988    const uint32_t *fromUSectionValues;
    989 
    990    uint32_t value;
    991    int32_t i, count;
    992 
    993    fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, char16_t)+sectionIndex;
    994    fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;
    995 
    996    /* read first pair of the section */
    997    count=*fromUSectionUChars++;
    998    value=*fromUSectionValues++;
    999 
   1000    if(extSetUseMapping(which, minLength, value)) {
   1001        if(length==U16_LENGTH(firstCP)) {
   1002            /* add the initial code point */
   1003            sa->add(sa->set, firstCP);
   1004        } else {
   1005            /* add the string so far */
   1006            sa->addString(sa->set, s, length);
   1007        }
   1008    }
   1009 
   1010    for(i=0; i<count; ++i) {
   1011        /* append this code unit and recurse or add the string */
   1012        s[length]=fromUSectionUChars[i];
   1013        value=fromUSectionValues[i];
   1014 
   1015        if(value==0) {
   1016            /* no mapping, do nothing */
   1017        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
   1018            ucnv_extGetUnicodeSetString(
   1019                sharedData, cx, sa, which, minLength,
   1020                firstCP, s, length+1,
   1021                static_cast<int32_t>(UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value)),
   1022                pErrorCode);
   1023        } else if(extSetUseMapping(which, minLength, value)) {
   1024            sa->addString(sa->set, s, length+1);
   1025        }
   1026    }
   1027 }
   1028 
   1029 U_CFUNC void
   1030 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
   1031                      const USetAdder *sa,
   1032                      UConverterUnicodeSet which,
   1033                      UConverterSetFilter filter,
   1034                      UErrorCode *pErrorCode) {
   1035    const int32_t *cx;
   1036    const uint16_t *stage12, *stage3, *ps2, *ps3;
   1037    const uint32_t *stage3b;
   1038 
   1039    uint32_t value;
   1040    int32_t st1, stage1Length, st2, st3, minLength;
   1041 
   1042    char16_t s[UCNV_EXT_MAX_UCHARS];
   1043    UChar32 c;
   1044    int32_t length;
   1045 
   1046    cx=sharedData->mbcs.extIndexes;
   1047    if(cx==nullptr) {
   1048        return;
   1049    }
   1050 
   1051    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
   1052    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
   1053    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
   1054 
   1055    stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
   1056 
   1057    /* enumerate the from-Unicode trie table */
   1058    c=0; /* keep track of the current code point while enumerating */
   1059 
   1060    if(filter==UCNV_SET_FILTER_2022_CN) {
   1061        minLength=3;
   1062    } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
   1063               filter!=UCNV_SET_FILTER_NONE
   1064    ) {
   1065        /* DBCS-only, ignore single-byte results */
   1066        minLength=2;
   1067    } else {
   1068        minLength=1;
   1069    }
   1070 
   1071    /*
   1072     * the trie enumeration is almost the same as
   1073     * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1
   1074     */
   1075    for(st1=0; st1<stage1Length; ++st1) {
   1076        st2=stage12[st1];
   1077        if(st2>stage1Length) {
   1078            ps2=stage12+st2;
   1079            for(st2=0; st2<64; ++st2) {
   1080                if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) {
   1081                    /* read the stage 3 block */
   1082                    ps3=stage3+st3;
   1083 
   1084                    do {
   1085                        value=stage3b[*ps3++];
   1086                        if(value==0) {
   1087                            /* no mapping, do nothing */
   1088                        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
   1089                            // Recurse for partial results.
   1090                            length=0;
   1091                            U16_APPEND_UNSAFE(s, length, c);
   1092                            ucnv_extGetUnicodeSetString(
   1093                                sharedData, cx, sa, which, minLength,
   1094                                c, s, length,
   1095                                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
   1096                                pErrorCode);
   1097                        } else if(extSetUseMapping(which, minLength, value)) {
   1098                            switch(filter) {
   1099                            case UCNV_SET_FILTER_2022_CN:
   1100                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
   1101                                    continue;
   1102                                }
   1103                                break;
   1104                            case UCNV_SET_FILTER_SJIS:
   1105                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
   1106                                    continue;
   1107                                }
   1108                                break;
   1109                            case UCNV_SET_FILTER_GR94DBCS:
   1110                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
   1111                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
   1112                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
   1113                                    continue;
   1114                                }
   1115                                break;
   1116                            case UCNV_SET_FILTER_HZ:
   1117                                if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
   1118                                     (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
   1119                                     (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
   1120                                    continue;
   1121                                }
   1122                                break;
   1123                            default:
   1124                                /*
   1125                                 * UCNV_SET_FILTER_NONE,
   1126                                 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
   1127                                 */
   1128                                break;
   1129                            }
   1130                            sa->add(sa->set, c);
   1131                        }
   1132                    } while((++c&0xf)!=0);
   1133                } else {
   1134                    c+=16; /* empty stage 3 block */
   1135                }
   1136            }
   1137        } else {
   1138            c+=1024; /* empty stage 2 block */
   1139        }
   1140    }
   1141 }
   1142 
   1143 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */