[ tor-browser ].git.dasho

unames.cpp (68527B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  unames.c
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 1999oct04
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 #include "unicode/putil.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/udata.h"
     23 #include "unicode/utf.h"
     24 #include "unicode/utf16.h"
     25 #include "uassert.h"
     26 #include "ustr_imp.h"
     27 #include "umutex.h"
     28 #include "cmemory.h"
     29 #include "cstring.h"
     30 #include "ucln_cmn.h"
     31 #include "udataswp.h"
     32 #include "uprops.h"
     33 
     34 U_NAMESPACE_BEGIN
     35 
     36 /* prototypes ------------------------------------------------------------- */
     37 
     38 static const char DATA_NAME[] = "unames";
     39 static const char DATA_TYPE[] = "icu";
     40 
     41 #define GROUP_SHIFT 5
     42 #define LINES_PER_GROUP (1L<<GROUP_SHIFT)
     43 #define GROUP_MASK (LINES_PER_GROUP-1)
     44 
     45 /*
     46 * This struct was replaced by explicitly accessing equivalent
     47 * fields from triples of uint16_t.
     48 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
     49 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
     50 * would advance by 6 bytes (3 uint16_t).
     51 *
     52 * We can't just change the data structure because it's loaded from a data file,
     53 * and we don't want to make it less compact, so we changed the access code.
     54 *
     55 * For details see ICU tickets 6331 and 6008.
     56 typedef struct {
     57    uint16_t groupMSB,
     58             offsetHigh, offsetLow; / * avoid padding * /
     59 } Group;
     60 */
     61 enum {
     62    GROUP_MSB,
     63    GROUP_OFFSET_HIGH,
     64    GROUP_OFFSET_LOW,
     65    GROUP_LENGTH
     66 };
     67 
     68 /*
     69 * Get the 32-bit group offset.
     70 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
     71 * @return group offset (int32_t)
     72 */
     73 #define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW])
     74 
     75 #define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
     76 #define PREV_GROUP(group) ((group)-GROUP_LENGTH)
     77 
     78 typedef struct {
     79    uint32_t start, end;
     80    uint8_t type, variant;
     81    uint16_t size;
     82 } AlgorithmicRange;
     83 
     84 typedef struct {
     85    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
     86 } UCharNames;
     87 
     88 /*
     89 * Get the groups table from a UCharNames struct.
     90 * The groups table consists of one uint16_t groupCount followed by
     91 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
     92 * and the comment for the old struct Group above.
     93 *
     94 * @param names (const UCharNames *) pointer to the UCharNames indexes
     95 * @return (const uint16_t *) pointer to the groups table
     96 */
     97 #define GET_GROUPS(names) (const uint16_t *)((const char *)names+names->groupsOffset)
     98 
     99 typedef struct {
    100    const char *otherName;
    101    UChar32 code;
    102 } FindName;
    103 
    104 #define DO_FIND_NAME nullptr
    105 
    106 static UDataMemory *uCharNamesData=nullptr;
    107 static UCharNames *uCharNames=nullptr;
    108 static icu::UInitOnce gCharNamesInitOnce {};
    109 
    110 /*
    111 * Maximum length of character names (regular & 1.0).
    112 */
    113 static int32_t gMaxNameLength=0;
    114 
    115 /*
    116 * Set of chars used in character names (regular & 1.0).
    117 * Chars are platform-dependent (can be EBCDIC).
    118 */
    119 static uint32_t gNameSet[8]={ 0 };
    120 
    121 #define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT
    122 #define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1
    123 #define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2
    124 
    125 #define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
    126 
    127 static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
    128    "unassigned",
    129    "uppercase letter",
    130    "lowercase letter",
    131    "titlecase letter",
    132    "modifier letter",
    133    "other letter",
    134    "non spacing mark",
    135    "enclosing mark",
    136    "combining spacing mark",
    137    "decimal digit number",
    138    "letter number",
    139    "other number",
    140    "space separator",
    141    "line separator",
    142    "paragraph separator",
    143    "control",
    144    "format",
    145    "private use area",
    146    "surrogate",
    147    "dash punctuation",   
    148    "start punctuation",
    149    "end punctuation",
    150    "connector punctuation",
    151    "other punctuation",
    152    "math symbol",
    153    "currency symbol",
    154    "modifier symbol",
    155    "other symbol",
    156    "initial punctuation",
    157    "final punctuation",
    158    "noncharacter",
    159    "lead surrogate",
    160    "trail surrogate"
    161 };
    162 
    163 /* implementation ----------------------------------------------------------- */
    164 
    165 static UBool U_CALLCONV unames_cleanup()
    166 {
    167    if(uCharNamesData) {
    168        udata_close(uCharNamesData);
    169        uCharNamesData = nullptr;
    170    }
    171    if(uCharNames) {
    172        uCharNames = nullptr;
    173    }
    174    gCharNamesInitOnce.reset();
    175    gMaxNameLength=0;
    176    return true;
    177 }
    178 
    179 static UBool U_CALLCONV
    180 isAcceptable(void * /*context*/,
    181             const char * /*type*/, const char * /*name*/,
    182             const UDataInfo *pInfo) {
    183    return
    184        pInfo->size>=20 &&
    185        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
    186        pInfo->charsetFamily==U_CHARSET_FAMILY &&
    187        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
    188        pInfo->dataFormat[1]==0x6e &&
    189        pInfo->dataFormat[2]==0x61 &&
    190        pInfo->dataFormat[3]==0x6d &&
    191        pInfo->formatVersion[0]==1;
    192 }
    193 
    194 static void U_CALLCONV
    195 loadCharNames(UErrorCode &status) {
    196    U_ASSERT(uCharNamesData == nullptr);
    197    U_ASSERT(uCharNames == nullptr);
    198 
    199    uCharNamesData = udata_openChoice(nullptr, DATA_TYPE, DATA_NAME, isAcceptable, nullptr, &status);
    200    if(U_FAILURE(status)) {
    201        uCharNamesData = nullptr;
    202    } else {
    203        uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
    204    }
    205    ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
    206 }
    207 
    208 
    209 static UBool
    210 isDataLoaded(UErrorCode *pErrorCode) {
    211    umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
    212    return U_SUCCESS(*pErrorCode);
    213 }
    214 
    215 #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) UPRV_BLOCK_MACRO_BEGIN { \
    216    if((bufferLength)>0) { \
    217        *(buffer)++=c; \
    218        --(bufferLength); \
    219    } \
    220    ++(bufferPos); \
    221 } UPRV_BLOCK_MACRO_END
    222 
    223 #define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
    224 
    225 /*
    226 * Important: expandName() and compareName() are almost the same -
    227 * apply fixes to both.
    228 *
    229 * UnicodeData.txt uses ';' as a field separator, so no
    230 * field can contain ';' as part of its contents.
    231 * In unames.dat, it is marked as token[';']==-1 only if the
    232 * semicolon is used in the data file - which is iff we
    233 * have Unicode 1.0 names or ISO comments or aliases.
    234 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
    235 * although we know that it will never be part of a name.
    236 */
    237 static uint16_t
    238 expandName(UCharNames *names,
    239           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
    240           char *buffer, uint16_t bufferLength) {
    241    uint16_t* tokens = reinterpret_cast<uint16_t*>(names) + 8;
    242    uint16_t token, tokenCount=*tokens++, bufferPos=0;
    243    uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(names) + names->tokenStringOffset;
    244    uint8_t c;
    245 
    246    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
    247        /*
    248         * skip the modern name if it is not requested _and_
    249         * if the semicolon byte value is a character, not a token number
    250         */
    251        if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
    252            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
    253            do {
    254                while(nameLength>0) {
    255                    --nameLength;
    256                    if(*name++==';') {
    257                        break;
    258                    }
    259                }
    260            } while(--fieldIndex>0);
    261        } else {
    262            /*
    263             * the semicolon byte value is a token number, therefore
    264             * only modern names are stored in unames.dat and there is no
    265             * such requested alternate name here
    266             */
    267            nameLength=0;
    268        }
    269    }
    270 
    271    /* write each letter directly, and write a token word per token */
    272    while(nameLength>0) {
    273        --nameLength;
    274        c=*name++;
    275 
    276        if(c>=tokenCount) {
    277            if(c!=';') {
    278                /* implicit letter */
    279                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
    280            } else {
    281                /* finished */
    282                break;
    283            }
    284        } else {
    285            token=tokens[c];
    286            if (token == static_cast<uint16_t>(-2)) {
    287                /* this is a lead byte for a double-byte token */
    288                token=tokens[c<<8|*name++];
    289                --nameLength;
    290            }
    291            if (token == static_cast<uint16_t>(-1)) {
    292                if(c!=';') {
    293                    /* explicit letter */
    294                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
    295                } else {
    296                    /* stop, but skip the semicolon if we are seeking
    297                       extended names and there was no 2.0 name but there
    298                       is a 1.0 name. */
    299                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
    300                        if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
    301                            continue;
    302                        }
    303                    }
    304                    /* finished */
    305                    break;
    306                }
    307            } else {
    308                /* write token word */
    309                uint8_t *tokenString=tokenStrings+token;
    310                while((c=*tokenString++)!=0) {
    311                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
    312                }
    313            }
    314        }
    315    }
    316 
    317    /* zero-terminate */
    318    if(bufferLength>0) {
    319        *buffer=0;
    320    }
    321 
    322    return bufferPos;
    323 }
    324 
    325 /*
    326 * compareName() is almost the same as expandName() except that it compares
    327 * the currently expanded name to an input name.
    328 * It returns the match/no match result as soon as possible.
    329 */
    330 static UBool
    331 compareName(UCharNames *names,
    332            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
    333            const char *otherName) {
    334    uint16_t* tokens = reinterpret_cast<uint16_t*>(names) + 8;
    335    uint16_t token, tokenCount=*tokens++;
    336    uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(names) + names->tokenStringOffset;
    337    uint8_t c;
    338    const char *origOtherName = otherName;
    339 
    340    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
    341        /*
    342         * skip the modern name if it is not requested _and_
    343         * if the semicolon byte value is a character, not a token number
    344         */
    345        if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
    346            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
    347            do {
    348                while(nameLength>0) {
    349                    --nameLength;
    350                    if(*name++==';') {
    351                        break;
    352                    }
    353                }
    354            } while(--fieldIndex>0);
    355        } else {
    356            /*
    357             * the semicolon byte value is a token number, therefore
    358             * only modern names are stored in unames.dat and there is no
    359             * such requested alternate name here
    360             */
    361            nameLength=0;
    362        }
    363    }
    364 
    365    /* compare each letter directly, and compare a token word per token */
    366    while(nameLength>0) {
    367        --nameLength;
    368        c=*name++;
    369 
    370        if(c>=tokenCount) {
    371            if(c!=';') {
    372                /* implicit letter */
    373                if (static_cast<char>(c) != *otherName++) {
    374                    return false;
    375                }
    376            } else {
    377                /* finished */
    378                break;
    379            }
    380        } else {
    381            token=tokens[c];
    382            if (token == static_cast<uint16_t>(-2)) {
    383                /* this is a lead byte for a double-byte token */
    384                token=tokens[c<<8|*name++];
    385                --nameLength;
    386            }
    387            if (token == static_cast<uint16_t>(-1)) {
    388                if(c!=';') {
    389                    /* explicit letter */
    390                    if (static_cast<char>(c) != *otherName++) {
    391                        return false;
    392                    }
    393                } else {
    394                    /* stop, but skip the semicolon if we are seeking
    395                       extended names and there was no 2.0 name but there
    396                       is a 1.0 name. */
    397                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
    398                        if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
    399                            continue;
    400                        }
    401                    }
    402                    /* finished */
    403                    break;
    404                }
    405            } else {
    406                /* write token word */
    407                uint8_t *tokenString=tokenStrings+token;
    408                while((c=*tokenString++)!=0) {
    409                    if (static_cast<char>(c) != *otherName++) {
    410                        return false;
    411                    }
    412                }
    413            }
    414        }
    415    }
    416 
    417    /* complete match? */
    418    return *otherName == 0;
    419 }
    420 
    421 static uint8_t getCharCat(UChar32 cp) {
    422    uint8_t cat;
    423 
    424    if (U_IS_UNICODE_NONCHAR(cp)) {
    425        return U_NONCHARACTER_CODE_POINT;
    426    }
    427 
    428    if ((cat = u_charType(cp)) == U_SURROGATE) {
    429        cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
    430    }
    431 
    432    return cat;
    433 }
    434 
    435 static const char *getCharCatName(UChar32 cp) {
    436    uint8_t cat = getCharCat(cp);
    437 
    438    /* Return unknown if the table of names above is not up to
    439       date. */
    440 
    441    if (cat >= UPRV_LENGTHOF(charCatNames)) {
    442        return "unknown";
    443    } else {
    444        return charCatNames[cat];
    445    }
    446 }
    447 
    448 static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
    449    const char *catname = getCharCatName(code);
    450    uint16_t length = 0;
    451 
    452    UChar32 cp;
    453    int ndigits, i;
    454    
    455    WRITE_CHAR(buffer, bufferLength, length, '<');
    456    while (catname[length - 1]) {
    457        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
    458    }
    459    WRITE_CHAR(buffer, bufferLength, length, '-');
    460    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
    461        ;
    462    if (ndigits < 4)
    463        ndigits = 4;
    464    for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
    465        uint8_t v = static_cast<uint8_t>(cp & 0xf);
    466        buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
    467    }
    468    buffer += ndigits;
    469    length += static_cast<uint16_t>(ndigits);
    470    WRITE_CHAR(buffer, bufferLength, length, '>');
    471 
    472    return length;
    473 }
    474 
    475 /*
    476 * getGroup() does a binary search for the group that contains the
    477 * Unicode code point "code".
    478 * The return value is always a valid Group* that may contain "code"
    479 * or else is the highest group before "code".
    480 * If the lowest group is after "code", then that one is returned.
    481 */
    482 static const uint16_t *
    483 getGroup(UCharNames *names, uint32_t code) {
    484    const uint16_t *groups=GET_GROUPS(names);
    485    uint16_t groupMSB = static_cast<uint16_t>(code >> GROUP_SHIFT),
    486             start=0,
    487             limit=*groups++,
    488             number;
    489 
    490    /* binary search for the group of names that contains the one for code */
    491    while(start<limit-1) {
    492        number = static_cast<uint16_t>((start + limit) / 2);
    493        if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
    494            limit=number;
    495        } else {
    496            start=number;
    497        }
    498    }
    499 
    500    /* return this regardless of whether it is an exact match */
    501    return groups+start*GROUP_LENGTH;
    502 }
    503 
    504 /*
    505 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
    506 * expands them into offsets and lengths for each string.
    507 * Lengths are stored with a variable-width encoding in consecutive nibbles:
    508 * If a nibble<0xc, then it is the length itself (0=empty string).
    509 * If a nibble>=0xc, then it forms a length value with the following nibble.
    510 * Calculation see below.
    511 * The offsets and lengths arrays must be at least 33 (one more) long because
    512 * there is no check here at the end if the last nibble is still used.
    513 */
    514 static const uint8_t *
    515 expandGroupLengths(const uint8_t *s,
    516                   uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
    517    /* read the lengths of the 32 strings in this group and get each string's offset */
    518    uint16_t i=0, offset=0, length=0;
    519    uint8_t lengthByte;
    520 
    521    /* all 32 lengths must be read to get the offset of the first group string */
    522    while(i<LINES_PER_GROUP) {
    523        lengthByte=*s++;
    524 
    525        /* read even nibble - MSBs of lengthByte */
    526        if(length>=12) {
    527            /* double-nibble length spread across two bytes */
    528            length = static_cast<uint16_t>(((length & 0x3) << 4 | lengthByte >> 4) + 12);
    529            lengthByte&=0xf;
    530        } else if((lengthByte /* &0xf0 */)>=0xc0) {
    531            /* double-nibble length spread across this one byte */
    532            length = static_cast<uint16_t>((lengthByte & 0x3f) + 12);
    533        } else {
    534            /* single-nibble length in MSBs */
    535            length = static_cast<uint16_t>(lengthByte >> 4);
    536            lengthByte&=0xf;
    537        }
    538 
    539        *offsets++=offset;
    540        *lengths++=length;
    541 
    542        offset+=length;
    543        ++i;
    544 
    545        /* read odd nibble - LSBs of lengthByte */
    546        if((lengthByte&0xf0)==0) {
    547            /* this nibble was not consumed for a double-nibble length above */
    548            length=lengthByte;
    549            if(length<12) {
    550                /* single-nibble length in LSBs */
    551                *offsets++=offset;
    552                *lengths++=length;
    553 
    554                offset+=length;
    555                ++i;
    556            }
    557        } else {
    558            length=0;   /* prevent double-nibble detection in the next iteration */
    559        }
    560    }
    561 
    562    /* now, s is at the first group string */
    563    return s;
    564 }
    565 
    566 static uint16_t
    567 expandGroupName(UCharNames *names, const uint16_t *group,
    568                uint16_t lineNumber, UCharNameChoice nameChoice,
    569                char *buffer, uint16_t bufferLength) {
    570    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
    571    const uint8_t* s = reinterpret_cast<uint8_t*>(names) + names->groupStringOffset + GET_GROUP_OFFSET(group);
    572    s=expandGroupLengths(s, offsets, lengths);
    573    return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
    574                      buffer, bufferLength);
    575 }
    576 
    577 static uint16_t
    578 getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
    579        char *buffer, uint16_t bufferLength) {
    580    const uint16_t *group=getGroup(names, code);
    581    if (static_cast<uint16_t>(code >> GROUP_SHIFT) == group[GROUP_MSB]) {
    582        return expandGroupName(names, group, static_cast<uint16_t>(code & GROUP_MASK), nameChoice,
    583                               buffer, bufferLength);
    584    } else {
    585        /* group not found */
    586        /* zero-terminate */
    587        if(bufferLength>0) {
    588            *buffer=0;
    589        }
    590        return 0;
    591    }
    592 }
    593 
    594 /*
    595 * enumGroupNames() enumerates all the names in a 32-group
    596 * and either calls the enumerator function or finds a given input name.
    597 */
    598 static UBool
    599 enumGroupNames(UCharNames *names, const uint16_t *group,
    600               UChar32 start, UChar32 end,
    601               UEnumCharNamesFn *fn, void *context,
    602               UCharNameChoice nameChoice) {
    603    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
    604    const uint8_t* s = reinterpret_cast<uint8_t*>(names) + names->groupStringOffset + GET_GROUP_OFFSET(group);
    605 
    606    s=expandGroupLengths(s, offsets, lengths);
    607    if(fn!=DO_FIND_NAME) {
    608        char buffer[200];
    609        uint16_t length;
    610 
    611        while(start<=end) {
    612            length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
    613            if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
    614                buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
    615            }
    616            /* here, we assume that the buffer is large enough */
    617            if(length>0) {
    618                if(!fn(context, start, nameChoice, buffer, length)) {
    619                    return false;
    620                }
    621            }
    622            ++start;
    623        }
    624    } else {
    625        const char* otherName = static_cast<FindName*>(context)->otherName;
    626        while(start<=end) {
    627            if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
    628                static_cast<FindName*>(context)->code = start;
    629                return false;
    630            }
    631            ++start;
    632        }
    633    }
    634    return true;
    635 }
    636 
    637 /*
    638 * enumExtNames enumerate extended names.
    639 * It only needs to do it if it is called with a real function and not
    640 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
    641 * for extended names by itself.
    642 */ 
    643 static UBool
    644 enumExtNames(UChar32 start, UChar32 end,
    645             UEnumCharNamesFn *fn, void *context)
    646 {
    647    if(fn!=DO_FIND_NAME) {
    648        char buffer[200];
    649        uint16_t length;
    650        
    651        while(start<=end) {
    652            buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
    653            /* here, we assume that the buffer is large enough */
    654            if(length>0) {
    655                if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
    656                    return false;
    657                }
    658            }
    659            ++start;
    660        }
    661    }
    662 
    663    return true;
    664 }
    665 
    666 static UBool
    667 enumNames(UCharNames *names,
    668          UChar32 start, UChar32 limit,
    669          UEnumCharNamesFn *fn, void *context,
    670          UCharNameChoice nameChoice) {
    671    uint16_t startGroupMSB, endGroupMSB, groupCount;
    672    const uint16_t *group, *groupLimit;
    673 
    674    startGroupMSB = static_cast<uint16_t>(start >> GROUP_SHIFT);
    675    endGroupMSB = static_cast<uint16_t>((limit - 1) >> GROUP_SHIFT);
    676 
    677    /* find the group that contains start, or the highest before it */
    678    group=getGroup(names, start);
    679 
    680    if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
    681        /* enumerate synthetic names between start and the group start */
    682        UChar32 extLimit = static_cast<UChar32>(group[GROUP_MSB]) << GROUP_SHIFT;
    683        if(extLimit>limit) {
    684            extLimit=limit;
    685        }
    686        if(!enumExtNames(start, extLimit-1, fn, context)) {
    687            return false;
    688        }
    689        start=extLimit;
    690    }
    691 
    692    if(startGroupMSB==endGroupMSB) {
    693        if(startGroupMSB==group[GROUP_MSB]) {
    694            /* if start and limit-1 are in the same group, then enumerate only in that one */
    695            return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
    696        }
    697    } else {
    698        const uint16_t *groups=GET_GROUPS(names);
    699        groupCount=*groups++;
    700        groupLimit=groups+groupCount*GROUP_LENGTH;
    701 
    702        if(startGroupMSB==group[GROUP_MSB]) {
    703            /* enumerate characters in the partial start group */
    704            if((start&GROUP_MASK)!=0) {
    705                if(!enumGroupNames(names, group,
    706                                   start, (static_cast<UChar32>(startGroupMSB) << GROUP_SHIFT) + LINES_PER_GROUP - 1,
    707                                   fn, context, nameChoice)) {
    708                    return false;
    709                }
    710                group=NEXT_GROUP(group); /* continue with the next group */
    711            }
    712        } else if(startGroupMSB>group[GROUP_MSB]) {
    713            /* make sure that we start enumerating with the first group after start */
    714            const uint16_t *nextGroup=NEXT_GROUP(group);
    715            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
    716                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
    717                if (end > limit) {
    718                    end = limit;
    719                }
    720                if (!enumExtNames(start, end - 1, fn, context)) {
    721                    return false;
    722                }
    723            }
    724            group=nextGroup;
    725        }
    726 
    727        /* enumerate entire groups between the start- and end-groups */
    728        while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
    729            const uint16_t *nextGroup;
    730            start = static_cast<UChar32>(group[GROUP_MSB]) << GROUP_SHIFT;
    731            if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
    732                return false;
    733            }
    734            nextGroup=NEXT_GROUP(group);
    735            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
    736                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
    737                if (end > limit) {
    738                    end = limit;
    739                }
    740                if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
    741                    return false;
    742                }
    743            }
    744            group=nextGroup;
    745        }
    746 
    747        /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
    748        if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
    749            return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
    750        } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
    751            UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
    752            if (next > start) {
    753                start = next;
    754            }
    755        } else {
    756            return true;
    757        }
    758    }
    759 
    760    /* we have not found a group, which means everything is made of
    761       extended names. */
    762    if (nameChoice == U_EXTENDED_CHAR_NAME) {
    763        if (limit > UCHAR_MAX_VALUE + 1) {
    764            limit = UCHAR_MAX_VALUE + 1;
    765        }
    766        return enumExtNames(start, limit - 1, fn, context);
    767    }
    768    
    769    return true;
    770 }
    771 
    772 static uint16_t
    773 writeFactorSuffix(const uint16_t *factors, uint16_t count,
    774                  const char *s, /* suffix elements */
    775                  uint32_t code,
    776                  uint16_t indexes[8], /* output fields from here */
    777                  const char *elementBases[8], const char *elements[8],
    778                  char *buffer, uint16_t bufferLength) {
    779    uint16_t i, factor, bufferPos=0;
    780    char c;
    781 
    782    /* write elements according to the factors */
    783 
    784    /*
    785     * the factorized elements are determined by modulo arithmetic
    786     * with the factors of this algorithm
    787     *
    788     * note that for fewer operations, count is decremented here
    789     */
    790    --count;
    791    for(i=count; i>0; --i) {
    792        factor=factors[i];
    793        indexes[i] = static_cast<uint16_t>(code % factor);
    794        code/=factor;
    795    }
    796    /*
    797     * we don't need to calculate the last modulus because start<=code<=end
    798     * guarantees here that code<=factors[0]
    799     */
    800    indexes[0] = static_cast<uint16_t>(code);
    801 
    802    /* write each element */
    803    for(;;) {
    804        if(elementBases!=nullptr) {
    805            *elementBases++=s;
    806        }
    807 
    808        /* skip indexes[i] strings */
    809        factor=indexes[i];
    810        while(factor>0) {
    811            while(*s++!=0) {}
    812            --factor;
    813        }
    814        if(elements!=nullptr) {
    815            *elements++=s;
    816        }
    817 
    818        /* write element */
    819        while((c=*s++)!=0) {
    820            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
    821        }
    822 
    823        /* we do not need to perform the rest of this loop for i==count - break here */
    824        if(i>=count) {
    825            break;
    826        }
    827 
    828        /* skip the rest of the strings for this factors[i] */
    829        factor = static_cast<uint16_t>(factors[i] - indexes[i] - 1);
    830        while(factor>0) {
    831            while(*s++!=0) {}
    832            --factor;
    833        }
    834 
    835        ++i;
    836    }
    837 
    838    /* zero-terminate */
    839    if(bufferLength>0) {
    840        *buffer=0;
    841    }
    842 
    843    return bufferPos;
    844 }
    845 
    846 /*
    847 * Important:
    848 * Parts of findAlgName() are almost the same as some of getAlgName().
    849 * Fixes must be applied to both.
    850 */
    851 static uint16_t
    852 getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
    853        char *buffer, uint16_t bufferLength) {
    854    uint16_t bufferPos=0;
    855 
    856    /* Only the normative character name can be algorithmic. */
    857    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
    858        /* zero-terminate */
    859        if(bufferLength>0) {
    860            *buffer=0;
    861        }
    862        return 0;
    863    }
    864 
    865    switch(range->type) {
    866    case 0: {
    867        /* name = prefix hex-digits */
    868        const char* s = reinterpret_cast<const char*>(range + 1);
    869        char c;
    870 
    871        uint16_t i, count;
    872 
    873        /* copy prefix */
    874        while((c=*s++)!=0) {
    875            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
    876        }
    877 
    878        /* write hexadecimal code point value */
    879        count=range->variant;
    880 
    881        /* zero-terminate */
    882        if(count<bufferLength) {
    883            buffer[count]=0;
    884        }
    885 
    886        for(i=count; i>0;) {
    887            if(--i<bufferLength) {
    888                c = static_cast<char>(code & 0xf);
    889                if(c<10) {
    890                    c+='0';
    891                } else {
    892                    c+='A'-10;
    893                }
    894                buffer[i]=c;
    895            }
    896            code>>=4;
    897        }
    898 
    899        bufferPos+=count;
    900        break;
    901    }
    902    case 1: {
    903        /* name = prefix factorized-elements */
    904        uint16_t indexes[8];
    905        const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
    906        uint16_t count=range->variant;
    907        const char* s = reinterpret_cast<const char*>(factors + count);
    908        char c;
    909 
    910        /* copy prefix */
    911        while((c=*s++)!=0) {
    912            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
    913        }
    914 
    915        bufferPos+=writeFactorSuffix(factors, count,
    916                                     s, code-range->start, indexes, nullptr, nullptr, buffer, bufferLength);
    917        break;
    918    }
    919    default:
    920        /* undefined type */
    921        /* zero-terminate */
    922        if(bufferLength>0) {
    923            *buffer=0;
    924        }
    925        break;
    926    }
    927 
    928    return bufferPos;
    929 }
    930 
    931 /*
    932 * Important: enumAlgNames() and findAlgName() are almost the same.
    933 * Any fix must be applied to both.
    934 */
    935 static UBool
    936 enumAlgNames(AlgorithmicRange *range,
    937             UChar32 start, UChar32 limit,
    938             UEnumCharNamesFn *fn, void *context,
    939             UCharNameChoice nameChoice) {
    940    char buffer[200];
    941    uint16_t length;
    942 
    943    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
    944        return true;
    945    }
    946 
    947    switch(range->type) {
    948    case 0: {
    949        char *s, *end;
    950        char c;
    951 
    952        /* get the full name of the start character */
    953        length = getAlgName(range, static_cast<uint32_t>(start), nameChoice, buffer, sizeof(buffer));
    954        if(length<=0) {
    955            return true;
    956        }
    957 
    958        /* call the enumerator function with this first character */
    959        if(!fn(context, start, nameChoice, buffer, length)) {
    960            return false;
    961        }
    962 
    963        /* go to the end of the name; all these names have the same length */
    964        end=buffer;
    965        while(*end!=0) {
    966            ++end;
    967        }
    968 
    969        /* enumerate the rest of the names */
    970        while(++start<limit) {
    971            /* increment the hexadecimal number on a character-basis */
    972            s=end;
    973            for (;;) {
    974                c=*--s;
    975                if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
    976                    *s = static_cast<char>(c + 1);
    977                    break;
    978                } else if(c=='9') {
    979                    *s='A';
    980                    break;
    981                } else if(c=='F') {
    982                    *s='0';
    983                }
    984            }
    985 
    986            if(!fn(context, start, nameChoice, buffer, length)) {
    987                return false;
    988            }
    989        }
    990        break;
    991    }
    992    case 1: {
    993        uint16_t indexes[8];
    994        const char *elementBases[8], *elements[8];
    995        const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
    996        uint16_t count=range->variant;
    997        const char* s = reinterpret_cast<const char*>(factors + count);
    998        char *suffix, *t;
    999        uint16_t prefixLength, i, idx;
   1000 
   1001        char c;
   1002 
   1003        /* name = prefix factorized-elements */
   1004 
   1005        /* copy prefix */
   1006        suffix=buffer;
   1007        prefixLength=0;
   1008        while((c=*s++)!=0) {
   1009            *suffix++=c;
   1010            ++prefixLength;
   1011        }
   1012 
   1013        /* append the suffix of the start character */
   1014        length = static_cast<uint16_t>(prefixLength + writeFactorSuffix(factors, count,
   1015                                              s, static_cast<uint32_t>(start) - range->start,
   1016                                              indexes, elementBases, elements,
   1017                                              suffix, static_cast<uint16_t>(sizeof(buffer) - prefixLength)));
   1018 
   1019        /* call the enumerator function with this first character */
   1020        if(!fn(context, start, nameChoice, buffer, length)) {
   1021            return false;
   1022        }
   1023 
   1024        /* enumerate the rest of the names */
   1025        while(++start<limit) {
   1026            /* increment the indexes in lexical order bound by the factors */
   1027            i=count;
   1028            for (;;) {
   1029                idx = static_cast<uint16_t>(indexes[--i] + 1);
   1030                if(idx<factors[i]) {
   1031                    /* skip one index and its element string */
   1032                    indexes[i]=idx;
   1033                    s=elements[i];
   1034                    while(*s++!=0) {
   1035                    }
   1036                    elements[i]=s;
   1037                    break;
   1038                } else {
   1039                    /* reset this index to 0 and its element string to the first one */
   1040                    indexes[i]=0;
   1041                    elements[i]=elementBases[i];
   1042                }
   1043            }
   1044 
   1045            /* to make matters a little easier, just append all elements to the suffix */
   1046            t=suffix;
   1047            length=prefixLength;
   1048            for(i=0; i<count; ++i) {
   1049                s=elements[i];
   1050                while((c=*s++)!=0) {
   1051                    *t++=c;
   1052                    ++length;
   1053                }
   1054            }
   1055            /* zero-terminate */
   1056            *t=0;
   1057 
   1058            if(!fn(context, start, nameChoice, buffer, length)) {
   1059                return false;
   1060            }
   1061        }
   1062        break;
   1063    }
   1064    default:
   1065        /* undefined type */
   1066        break;
   1067    }
   1068 
   1069    return true;
   1070 }
   1071 
   1072 /*
   1073 * findAlgName() is almost the same as enumAlgNames() except that it
   1074 * returns the code point for a name if it fits into the range.
   1075 * It returns 0xffff otherwise.
   1076 */
   1077 static UChar32
   1078 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
   1079    UChar32 code;
   1080 
   1081    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
   1082        return 0xffff;
   1083    }
   1084 
   1085    switch(range->type) {
   1086    case 0: {
   1087        /* name = prefix hex-digits */
   1088        const char* s = reinterpret_cast<const char*>(range + 1);
   1089        char c;
   1090 
   1091        uint16_t i, count;
   1092 
   1093        /* compare prefix */
   1094        while((c=*s++)!=0) {
   1095            if (c != *otherName++) {
   1096                return 0xffff;
   1097            }
   1098        }
   1099 
   1100        /* read hexadecimal code point value */
   1101        count=range->variant;
   1102        code=0;
   1103        for(i=0; i<count; ++i) {
   1104            c=*otherName++;
   1105            if('0'<=c && c<='9') {
   1106                code=(code<<4)|(c-'0');
   1107            } else if('A'<=c && c<='F') {
   1108                code=(code<<4)|(c-'A'+10);
   1109            } else {
   1110                return 0xffff;
   1111            }
   1112        }
   1113 
   1114        /* does it fit into the range? */
   1115        if (*otherName == 0 && range->start <= static_cast<uint32_t>(code) && static_cast<uint32_t>(code) <= range->end) {
   1116            return code;
   1117        }
   1118        break;
   1119    }
   1120    case 1: {
   1121        char buffer[64];
   1122        uint16_t indexes[8];
   1123        const char *elementBases[8], *elements[8];
   1124        const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
   1125        uint16_t count=range->variant;
   1126        const char *s = reinterpret_cast<const char*>(factors + count), *t;
   1127        UChar32 start, limit;
   1128        uint16_t i, idx;
   1129 
   1130        char c;
   1131 
   1132        /* name = prefix factorized-elements */
   1133 
   1134        /* compare prefix */
   1135        while((c=*s++)!=0) {
   1136            if (c != *otherName++) {
   1137                return 0xffff;
   1138            }
   1139        }
   1140 
   1141        start = static_cast<UChar32>(range->start);
   1142        limit = static_cast<UChar32>(range->end + 1);
   1143 
   1144        /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
   1145        writeFactorSuffix(factors, count, s, 0,
   1146                          indexes, elementBases, elements, buffer, sizeof(buffer));
   1147 
   1148        /* compare the first suffix */
   1149        if(0==uprv_strcmp(otherName, buffer)) {
   1150            return start;
   1151        }
   1152 
   1153        /* enumerate and compare the rest of the suffixes */
   1154        while(++start<limit) {
   1155            /* increment the indexes in lexical order bound by the factors */
   1156            i=count;
   1157            for (;;) {
   1158                idx = static_cast<uint16_t>(indexes[--i] + 1);
   1159                if(idx<factors[i]) {
   1160                    /* skip one index and its element string */
   1161                    indexes[i]=idx;
   1162                    s=elements[i];
   1163                    while(*s++!=0) {}
   1164                    elements[i]=s;
   1165                    break;
   1166                } else {
   1167                    /* reset this index to 0 and its element string to the first one */
   1168                    indexes[i]=0;
   1169                    elements[i]=elementBases[i];
   1170                }
   1171            }
   1172 
   1173            /* to make matters a little easier, just compare all elements of the suffix */
   1174            t=otherName;
   1175            for(i=0; i<count; ++i) {
   1176                s=elements[i];
   1177                while((c=*s++)!=0) {
   1178                    if(c!=*t++) {
   1179                        s=""; /* does not match */
   1180                        i=99;
   1181                    }
   1182                }
   1183            }
   1184            if(i<99 && *t==0) {
   1185                return start;
   1186            }
   1187        }
   1188        break;
   1189    }
   1190    default:
   1191        /* undefined type */
   1192        break;
   1193    }
   1194 
   1195    return 0xffff;
   1196 }
   1197 
   1198 /* sets of name characters, maximum name lengths ---------------------------- */
   1199 
   1200 #define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f)))
   1201 #define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0)
   1202 
   1203 static int32_t
   1204 calcStringSetLength(uint32_t set[8], const char *s) {
   1205    int32_t length=0;
   1206    char c;
   1207 
   1208    while((c=*s++)!=0) {
   1209        SET_ADD(set, c);
   1210        ++length;
   1211    }
   1212    return length;
   1213 }
   1214 
   1215 static int32_t
   1216 calcAlgNameSetsLengths(int32_t maxNameLength) {
   1217    AlgorithmicRange *range;
   1218    uint32_t *p;
   1219    uint32_t rangeCount;
   1220    int32_t length;
   1221 
   1222    /* enumerate algorithmic ranges */
   1223    p = reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(uCharNames) + uCharNames->algNamesOffset);
   1224    rangeCount=*p;
   1225    range = reinterpret_cast<AlgorithmicRange*>(p + 1);
   1226    while(rangeCount>0) {
   1227        switch(range->type) {
   1228        case 0:
   1229            /* name = prefix + (range->variant times) hex-digits */
   1230            /* prefix */
   1231            length = calcStringSetLength(gNameSet, reinterpret_cast<const char*>(range + 1)) + range->variant;
   1232            if(length>maxNameLength) {
   1233                maxNameLength=length;
   1234            }
   1235            break;
   1236        case 1: {
   1237            /* name = prefix factorized-elements */
   1238            const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
   1239            const char *s;
   1240            int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
   1241 
   1242            /* prefix length */
   1243            s = reinterpret_cast<const char*>(factors + count);
   1244            length=calcStringSetLength(gNameSet, s);
   1245            s+=length+1; /* start of factor suffixes */
   1246 
   1247            /* get the set and maximum factor suffix length for each factor */
   1248            for(i=0; i<count; ++i) {
   1249                maxFactorLength=0;
   1250                for(factor=factors[i]; factor>0; --factor) {
   1251                    factorLength=calcStringSetLength(gNameSet, s);
   1252                    s+=factorLength+1;
   1253                    if(factorLength>maxFactorLength) {
   1254                        maxFactorLength=factorLength;
   1255                    }
   1256                }
   1257                length+=maxFactorLength;
   1258            }
   1259 
   1260            if(length>maxNameLength) {
   1261                maxNameLength=length;
   1262            }
   1263            break;
   1264        }
   1265        default:
   1266            /* unknown type */
   1267            break;
   1268        }
   1269 
   1270        range = reinterpret_cast<AlgorithmicRange*>(reinterpret_cast<uint8_t*>(range) + range->size);
   1271        --rangeCount;
   1272    }
   1273    return maxNameLength;
   1274 }
   1275 
   1276 static int32_t
   1277 calcExtNameSetsLengths(int32_t maxNameLength) {
   1278    int32_t i, length;
   1279 
   1280    for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) {
   1281        /*
   1282         * for each category, count the length of the category name
   1283         * plus 9=
   1284         * 2 for <>
   1285         * 1 for -
   1286         * 6 for most hex digits per code point
   1287         */
   1288        length=9+calcStringSetLength(gNameSet, charCatNames[i]);
   1289        if(length>maxNameLength) {
   1290            maxNameLength=length;
   1291        }
   1292    }
   1293    return maxNameLength;
   1294 }
   1295 
   1296 static int32_t
   1297 calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
   1298                  uint32_t set[8],
   1299                  const uint8_t **pLine, const uint8_t *lineLimit) {
   1300    const uint8_t *line=*pLine;
   1301    int32_t length=0, tokenLength;
   1302    uint16_t c, token;
   1303 
   1304    while (line != lineLimit && (c = *line++) != static_cast<uint8_t>(';')) {
   1305        if(c>=tokenCount) {
   1306            /* implicit letter */
   1307            SET_ADD(set, c);
   1308            ++length;
   1309        } else {
   1310            token=tokens[c];
   1311            if (token == static_cast<uint16_t>(-2)) {
   1312                /* this is a lead byte for a double-byte token */
   1313                c=c<<8|*line++;
   1314                token=tokens[c];
   1315            }
   1316            if (token == static_cast<uint16_t>(-1)) {
   1317                /* explicit letter */
   1318                SET_ADD(set, c);
   1319                ++length;
   1320            } else {
   1321                /* count token word */
   1322                if(tokenLengths!=nullptr) {
   1323                    /* use cached token length */
   1324                    tokenLength=tokenLengths[c];
   1325                    if(tokenLength==0) {
   1326                        tokenLength = calcStringSetLength(set, reinterpret_cast<const char*>(tokenStrings) + token);
   1327                        tokenLengths[c] = static_cast<int8_t>(tokenLength);
   1328                    }
   1329                } else {
   1330                    tokenLength = calcStringSetLength(set, reinterpret_cast<const char*>(tokenStrings) + token);
   1331                }
   1332                length+=tokenLength;
   1333            }
   1334        }
   1335    }
   1336 
   1337    *pLine=line;
   1338    return length;
   1339 }
   1340 
   1341 static void
   1342 calcGroupNameSetsLengths(int32_t maxNameLength) {
   1343    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
   1344 
   1345    uint16_t* tokens = reinterpret_cast<uint16_t*>(uCharNames) + 8;
   1346    uint16_t tokenCount=*tokens++;
   1347    uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(uCharNames) + uCharNames->tokenStringOffset;
   1348 
   1349    int8_t *tokenLengths;
   1350 
   1351    const uint16_t *group;
   1352    const uint8_t *s, *line, *lineLimit;
   1353 
   1354    int32_t groupCount, lineNumber, length;
   1355 
   1356    tokenLengths = static_cast<int8_t*>(uprv_malloc(tokenCount));
   1357    if(tokenLengths!=nullptr) {
   1358        uprv_memset(tokenLengths, 0, tokenCount);
   1359    }
   1360 
   1361    group=GET_GROUPS(uCharNames);
   1362    groupCount=*group++;
   1363 
   1364    /* enumerate all groups */
   1365    while(groupCount>0) {
   1366        s = reinterpret_cast<uint8_t*>(uCharNames) + uCharNames->groupStringOffset + GET_GROUP_OFFSET(group);
   1367        s=expandGroupLengths(s, offsets, lengths);
   1368 
   1369        /* enumerate all lines in each group */
   1370        for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
   1371            line=s+offsets[lineNumber];
   1372            length=lengths[lineNumber];
   1373            if(length==0) {
   1374                continue;
   1375            }
   1376 
   1377            lineLimit=line+length;
   1378 
   1379            /* read regular name */
   1380            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
   1381            if(length>maxNameLength) {
   1382                maxNameLength=length;
   1383            }
   1384            if(line==lineLimit) {
   1385                continue;
   1386            }
   1387 
   1388            /* read Unicode 1.0 name */
   1389            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
   1390            if(length>maxNameLength) {
   1391                maxNameLength=length;
   1392            }
   1393            if(line==lineLimit) {
   1394                continue;
   1395            }
   1396 
   1397            /* read ISO comment */
   1398            /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
   1399        }
   1400 
   1401        group=NEXT_GROUP(group);
   1402        --groupCount;
   1403    }
   1404 
   1405    if(tokenLengths!=nullptr) {
   1406        uprv_free(tokenLengths);
   1407    }
   1408 
   1409    /* set gMax... - name length last for threading */
   1410    gMaxNameLength=maxNameLength;
   1411 }
   1412 
   1413 static UBool
   1414 calcNameSetsLengths(UErrorCode *pErrorCode) {
   1415    static const char extChars[]="0123456789ABCDEF<>-";
   1416    int32_t i, maxNameLength;
   1417 
   1418    if(gMaxNameLength!=0) {
   1419        return true;
   1420    }
   1421 
   1422    if(!isDataLoaded(pErrorCode)) {
   1423        return false;
   1424    }
   1425 
   1426    /* set hex digits, used in various names, and <>-, used in extended names */
   1427    for (i = 0; i < static_cast<int32_t>(sizeof(extChars)) - 1; ++i) {
   1428        SET_ADD(gNameSet, extChars[i]);
   1429    }
   1430 
   1431    /* set sets and lengths from algorithmic names */
   1432    maxNameLength=calcAlgNameSetsLengths(0);
   1433 
   1434    /* set sets and lengths from extended names */
   1435    maxNameLength=calcExtNameSetsLengths(maxNameLength);
   1436 
   1437    /* set sets and lengths from group names, set global maximum values */
   1438    calcGroupNameSetsLengths(maxNameLength);
   1439 
   1440    return true;
   1441 }
   1442 
   1443 U_NAMESPACE_END
   1444 
   1445 /* public API --------------------------------------------------------------- */
   1446 
   1447 U_NAMESPACE_USE
   1448 
   1449 U_CAPI int32_t U_EXPORT2
   1450 u_charName(UChar32 code, UCharNameChoice nameChoice,
   1451           char *buffer, int32_t bufferLength,
   1452           UErrorCode *pErrorCode) {
   1453     AlgorithmicRange *algRange;
   1454    uint32_t *p;
   1455    uint32_t i;
   1456    int32_t length;
   1457 
   1458    /* check the argument values */
   1459    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
   1460        return 0;
   1461    } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
   1462              bufferLength<0 || (bufferLength>0 && buffer==nullptr)
   1463    ) {
   1464        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1465        return 0;
   1466    }
   1467 
   1468    if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
   1469        return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
   1470    }
   1471 
   1472    length=0;
   1473 
   1474    /* try algorithmic names first */
   1475    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
   1476    i=*p;
   1477    algRange=(AlgorithmicRange *)(p+1);
   1478    while(i>0) {
   1479        if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
   1480            length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
   1481            break;
   1482        }
   1483        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
   1484        --i;
   1485    }
   1486 
   1487    if(i==0) {
   1488        if (nameChoice == U_EXTENDED_CHAR_NAME) {
   1489            length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
   1490            if (!length) {
   1491                /* extended character name */
   1492                length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
   1493            }
   1494        } else {
   1495            /* normal character name */
   1496            length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
   1497        }
   1498    }
   1499 
   1500    return u_terminateChars(buffer, bufferLength, length, pErrorCode);
   1501 }
   1502 
   1503 U_CAPI int32_t U_EXPORT2
   1504 u_getISOComment(UChar32 /*c*/,
   1505                char *dest, int32_t destCapacity,
   1506                UErrorCode *pErrorCode) {
   1507    /* check the argument values */
   1508    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
   1509        return 0;
   1510    } else if(destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
   1511        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1512        return 0;
   1513    }
   1514 
   1515    return u_terminateChars(dest, destCapacity, 0, pErrorCode);
   1516 }
   1517 
   1518 U_CAPI UChar32 U_EXPORT2
   1519 u_charFromName(UCharNameChoice nameChoice,
   1520               const char *name,
   1521               UErrorCode *pErrorCode) {
   1522    char upper[120] = {0};
   1523    char lower[120] = {0};
   1524    FindName findName;
   1525    AlgorithmicRange *algRange;
   1526    uint32_t *p;
   1527    uint32_t i;
   1528    UChar32 cp = 0;
   1529    char c0;
   1530    static constexpr UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
   1531 
   1532    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
   1533        return error;
   1534    }
   1535 
   1536    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==nullptr || *name==0) {
   1537        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1538        return error;
   1539    }
   1540 
   1541    if(!isDataLoaded(pErrorCode)) {
   1542        return error;
   1543    }
   1544 
   1545    /* construct the uppercase and lowercase of the name first */
   1546    for(i=0; i<sizeof(upper); ++i) {
   1547        if((c0=*name++)!=0) {
   1548            upper[i]=uprv_toupper(c0);
   1549            lower[i]=uprv_tolower(c0);
   1550        } else {
   1551            upper[i]=lower[i]=0;
   1552            break;
   1553        }
   1554    }
   1555    if(i==sizeof(upper)) {
   1556        /* name too long, there is no such character */
   1557        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
   1558        return error;
   1559    }
   1560    // i==strlen(name)==strlen(lower)==strlen(upper)
   1561 
   1562    /* try extended names first */
   1563    if (lower[0] == '<') {
   1564        if (nameChoice == U_EXTENDED_CHAR_NAME && lower[--i] == '>') {
   1565            // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
   1566            uint32_t limit = i;
   1567            while (i >= 3 && lower[--i] != '-') {}
   1568 
   1569            // There should be 1 to 8 hex digits.
   1570            int32_t hexLength = limit - (i + 1);
   1571            if (i >= 2 && lower[i] == '-' && 1 <= hexLength && hexLength <= 8) {
   1572                uint32_t cIdx;
   1573 
   1574                lower[i] = 0;
   1575 
   1576                for (++i; i < limit; ++i) {
   1577                    if (lower[i] >= '0' && lower[i] <= '9') {
   1578                        cp = (cp << 4) + lower[i] - '0';
   1579                    } else if (lower[i] >= 'a' && lower[i] <= 'f') {
   1580                        cp = (cp << 4) + lower[i] - 'a' + 10;
   1581                    } else {
   1582                        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
   1583                        return error;
   1584                    }
   1585                    // Prevent signed-integer overflow and out-of-range code points.
   1586                    if (cp > UCHAR_MAX_VALUE) {
   1587                        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
   1588                        return error;
   1589                    }
   1590                }
   1591 
   1592                /* Now validate the category name.
   1593                   We could use a binary search, or a trie, if
   1594                   we really wanted to. */
   1595                uint8_t cat = getCharCat(cp);
   1596                for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {
   1597 
   1598                    if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
   1599                        if (cat == cIdx) {
   1600                            return cp;
   1601                        }
   1602                        break;
   1603                    }
   1604                }
   1605            }
   1606        }
   1607 
   1608        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
   1609        return error;
   1610    }
   1611 
   1612    /* try algorithmic names now */
   1613    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
   1614    i=*p;
   1615    algRange=(AlgorithmicRange *)(p+1);
   1616    while(i>0) {
   1617        if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
   1618            return cp;
   1619        }
   1620        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
   1621        --i;
   1622    }
   1623 
   1624    /* normal character name */
   1625    findName.otherName=upper;
   1626    findName.code=error;
   1627    enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
   1628    if (findName.code == error) {
   1629         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
   1630    }
   1631    return findName.code;
   1632 }
   1633 
   1634 U_CAPI void U_EXPORT2
   1635 u_enumCharNames(UChar32 start, UChar32 limit,
   1636                UEnumCharNamesFn *fn,
   1637                void *context,
   1638                UCharNameChoice nameChoice,
   1639                UErrorCode *pErrorCode) {
   1640    AlgorithmicRange *algRange;
   1641    uint32_t *p;
   1642    uint32_t i;
   1643 
   1644    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
   1645        return;
   1646    }
   1647 
   1648    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==nullptr) {
   1649        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1650        return;
   1651    }
   1652 
   1653    if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
   1654        limit = UCHAR_MAX_VALUE + 1;
   1655    }
   1656    if((uint32_t)start>=(uint32_t)limit) {
   1657        return;
   1658    }
   1659 
   1660    if(!isDataLoaded(pErrorCode)) {
   1661        return;
   1662    }
   1663 
   1664    /* interleave the data-driven ones with the algorithmic ones */
   1665    /* iterate over all algorithmic ranges; assume that they are in ascending order */
   1666    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
   1667    i=*p;
   1668    algRange=(AlgorithmicRange *)(p+1);
   1669    while(i>0) {
   1670        /* enumerate the character names before the current algorithmic range */
   1671        /* here: start<limit */
   1672        if((uint32_t)start<algRange->start) {
   1673            if((uint32_t)limit<=algRange->start) {
   1674                enumNames(uCharNames, start, limit, fn, context, nameChoice);
   1675                return;
   1676            }
   1677            if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
   1678                return;
   1679            }
   1680            start=(UChar32)algRange->start;
   1681        }
   1682        /* enumerate the character names in the current algorithmic range */
   1683        /* here: algRange->start<=start<limit */
   1684        if((uint32_t)start<=algRange->end) {
   1685            if((uint32_t)limit<=(algRange->end+1)) {
   1686                enumAlgNames(algRange, start, limit, fn, context, nameChoice);
   1687                return;
   1688            }
   1689            if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
   1690                return;
   1691            }
   1692            start=(UChar32)algRange->end+1;
   1693        }
   1694        /* continue to the next algorithmic range (here: start<limit) */
   1695        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
   1696        --i;
   1697    }
   1698    /* enumerate the character names after the last algorithmic range */
   1699    enumNames(uCharNames, start, limit, fn, context, nameChoice);
   1700 }
   1701 
   1702 U_CAPI int32_t U_EXPORT2
   1703 uprv_getMaxCharNameLength() {
   1704    UErrorCode errorCode=U_ZERO_ERROR;
   1705    if(calcNameSetsLengths(&errorCode)) {
   1706        return gMaxNameLength;
   1707    } else {
   1708        return 0;
   1709    }
   1710 }
   1711 
   1712 /**
   1713 * Converts the char set cset into a Unicode set uset.
   1714 * @param cset Set of 256 bit flags corresponding to a set of chars.
   1715 * @param uset USet to receive characters. Existing contents are deleted.
   1716 */
   1717 static void
   1718 charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
   1719    char16_t us[256];
   1720    char cs[256];
   1721 
   1722    int32_t i, length;
   1723    UErrorCode errorCode;
   1724 
   1725    errorCode=U_ZERO_ERROR;
   1726 
   1727    if(!calcNameSetsLengths(&errorCode)) {
   1728        return;
   1729    }
   1730 
   1731    /* build a char string with all chars that are used in character names */
   1732    length=0;
   1733    for(i=0; i<256; ++i) {
   1734        if(SET_CONTAINS(cset, i)) {
   1735            cs[length++] = static_cast<char>(i);
   1736        }
   1737    }
   1738 
   1739    /* convert the char string to a char16_t string */
   1740    u_charsToUChars(cs, us, length);
   1741 
   1742    /* add each char16_t to the USet */
   1743    for(i=0; i<length; ++i) {
   1744        if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (char16_t)0 */
   1745            sa->add(sa->set, us[i]);
   1746        }
   1747    }
   1748 }
   1749 
   1750 /**
   1751 * Fills set with characters that are used in Unicode character names.
   1752 * @param set USet to receive characters.
   1753 */
   1754 U_CAPI void U_EXPORT2
   1755 uprv_getCharNameCharacters(const USetAdder *sa) {
   1756    charSetToUSet(gNameSet, sa);
   1757 }
   1758 
   1759 /* data swapping ------------------------------------------------------------ */
   1760 
   1761 /*
   1762 * The token table contains non-negative entries for token bytes,
   1763 * and -1 for bytes that represent themselves in the data file's charset.
   1764 * -2 entries are used for lead bytes.
   1765 *
   1766 * Direct bytes (-1 entries) must be translated from the input charset family
   1767 * to the output charset family.
   1768 * makeTokenMap() writes a permutation mapping for this.
   1769 * Use it once for single-/lead-byte tokens and once more for all trail byte
   1770 * tokens. (';' is an unused trail byte marked with -1.)
   1771 */
   1772 static void
   1773 makeTokenMap(const UDataSwapper *ds,
   1774             int16_t tokens[], uint16_t tokenCount,
   1775             uint8_t map[256],
   1776             UErrorCode *pErrorCode) {
   1777    UBool usedOutChar[256];
   1778    uint16_t i, j;
   1779    uint8_t c1, c2;
   1780 
   1781    if(U_FAILURE(*pErrorCode)) {
   1782        return;
   1783    }
   1784 
   1785    if(ds->inCharset==ds->outCharset) {
   1786        /* Same charset family: identity permutation */
   1787        for(i=0; i<256; ++i) {
   1788            map[i] = static_cast<uint8_t>(i);
   1789        }
   1790    } else {
   1791        uprv_memset(map, 0, 256);
   1792        uprv_memset(usedOutChar, 0, 256);
   1793 
   1794        if(tokenCount>256) {
   1795            tokenCount=256;
   1796        }
   1797 
   1798        /* set the direct bytes (byte 0 always maps to itself) */
   1799        for(i=1; i<tokenCount; ++i) {
   1800            if(tokens[i]==-1) {
   1801                /* convert the direct byte character */
   1802                c1 = static_cast<uint8_t>(i);
   1803                ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
   1804                if(U_FAILURE(*pErrorCode)) {
   1805                    udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
   1806                                     i, ds->inCharset);
   1807                    return;
   1808                }
   1809 
   1810                /* enter the converted character into the map and mark it used */
   1811                map[c1]=c2;
   1812                usedOutChar[c2]=true;
   1813            }
   1814        }
   1815 
   1816        /* set the mappings for the rest of the permutation */
   1817        for(i=j=1; i<tokenCount; ++i) {
   1818            /* set mappings that were not set for direct bytes */
   1819            if(map[i]==0) {
   1820                /* set an output byte value that was not used as an output byte above */
   1821                while(usedOutChar[j]) {
   1822                    ++j;
   1823                }
   1824                map[i] = static_cast<uint8_t>(j++);
   1825            }
   1826        }
   1827 
   1828        /*
   1829         * leave mappings at tokenCount and above unset if tokenCount<256
   1830         * because they won't be used
   1831         */
   1832    }
   1833 }
   1834 
   1835 U_CAPI int32_t U_EXPORT2
   1836 uchar_swapNames(const UDataSwapper *ds,
   1837                const void *inData, int32_t length, void *outData,
   1838                UErrorCode *pErrorCode) {
   1839    const UDataInfo *pInfo;
   1840    int32_t headerSize;
   1841 
   1842    const uint8_t *inBytes;
   1843    uint8_t *outBytes;
   1844 
   1845    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
   1846             offset, i, count, stringsCount;
   1847 
   1848    const AlgorithmicRange *inRange;
   1849    AlgorithmicRange *outRange;
   1850 
   1851    /* udata_swapDataHeader checks the arguments */
   1852    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
   1853    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
   1854        return 0;
   1855    }
   1856 
   1857    /* check data format and format version */
   1858    pInfo=(const UDataInfo *)((const char *)inData+4);
   1859    if(!(
   1860        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
   1861        pInfo->dataFormat[1]==0x6e &&
   1862        pInfo->dataFormat[2]==0x61 &&
   1863        pInfo->dataFormat[3]==0x6d &&
   1864        pInfo->formatVersion[0]==1
   1865    )) {
   1866        udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
   1867                         pInfo->dataFormat[0], pInfo->dataFormat[1],
   1868                         pInfo->dataFormat[2], pInfo->dataFormat[3],
   1869                         pInfo->formatVersion[0]);
   1870        *pErrorCode=U_UNSUPPORTED_ERROR;
   1871        return 0;
   1872    }
   1873 
   1874    inBytes=(const uint8_t *)inData+headerSize;
   1875    outBytes=(outData == nullptr) ? nullptr : (uint8_t *)outData+headerSize;
   1876    if(length<0) {
   1877        algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
   1878    } else {
   1879        length-=headerSize;
   1880        if( length<20 ||
   1881            (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
   1882        ) {
   1883            udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
   1884                             length);
   1885            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1886            return 0;
   1887        }
   1888    }
   1889 
   1890    if(length<0) {
   1891        /* preflighting: iterate through algorithmic ranges */
   1892        offset=algNamesOffset;
   1893        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
   1894        offset+=4;
   1895 
   1896        for(i=0; i<count; ++i) {
   1897            inRange=(const AlgorithmicRange *)(inBytes+offset);
   1898            offset+=ds->readUInt16(inRange->size);
   1899        }
   1900    } else {
   1901        /* swap data */
   1902        const uint16_t *p;
   1903        uint16_t *q, *temp;
   1904 
   1905        int16_t tokens[512];
   1906        uint16_t tokenCount;
   1907 
   1908        uint8_t map[256], trailMap[256];
   1909 
   1910        /* copy the data for inaccessible bytes */
   1911        if(inBytes!=outBytes) {
   1912            uprv_memcpy(outBytes, inBytes, length);
   1913        }
   1914 
   1915        /* the initial 4 offsets first */
   1916        tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
   1917        groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
   1918        groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
   1919        ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
   1920 
   1921        /*
   1922         * now the tokens table
   1923         * it needs to be permutated along with the compressed name strings
   1924         */
   1925        p=(const uint16_t *)(inBytes+16);
   1926        q=(uint16_t *)(outBytes+16);
   1927 
   1928        /* read and swap the tokenCount */
   1929        tokenCount=ds->readUInt16(*p);
   1930        ds->swapArray16(ds, p, 2, q, pErrorCode);
   1931        ++p;
   1932        ++q;
   1933 
   1934        /* read the first 512 tokens and make the token maps */
   1935        if(tokenCount<=512) {
   1936            count=tokenCount;
   1937        } else {
   1938            count=512;
   1939        }
   1940        for(i=0; i<count; ++i) {
   1941            tokens[i]=udata_readInt16(ds, p[i]);
   1942        }
   1943        for(; i<512; ++i) {
   1944            tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
   1945        }
   1946        makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
   1947        makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
   1948        if(U_FAILURE(*pErrorCode)) {
   1949            return 0;
   1950        }
   1951 
   1952        /*
   1953         * swap and permutate the tokens
   1954         * go through a temporary array to support in-place swapping
   1955         */
   1956        temp=(uint16_t *)uprv_malloc(tokenCount*2);
   1957        if(temp==nullptr) {
   1958            udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
   1959                             tokenCount);
   1960            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1961            return 0;
   1962        }
   1963 
   1964        /* swap and permutate single-/lead-byte tokens */
   1965        for(i=0; i<tokenCount && i<256; ++i) {
   1966            ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
   1967        }
   1968 
   1969        /* swap and permutate trail-byte tokens */
   1970        for(; i<tokenCount; ++i) {
   1971            ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
   1972        }
   1973 
   1974        /* copy the result into the output and free the temporary array */
   1975        uprv_memcpy(q, temp, tokenCount*2);
   1976        uprv_free(temp);
   1977 
   1978        /*
   1979         * swap the token strings but not a possible padding byte after
   1980         * the terminating NUL of the last string
   1981         */
   1982        udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
   1983                                    outBytes+tokenStringOffset, pErrorCode);
   1984        if(U_FAILURE(*pErrorCode)) {
   1985            udata_printError(ds, "uchar_swapNames(token strings) failed\n");
   1986            return 0;
   1987        }
   1988 
   1989        /* swap the group table */
   1990        count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
   1991        ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
   1992                           outBytes+groupsOffset, pErrorCode);
   1993 
   1994        /*
   1995         * swap the group strings
   1996         * swap the string bytes but not the nibble-encoded string lengths
   1997         */
   1998        if(ds->inCharset!=ds->outCharset) {
   1999            uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
   2000 
   2001            const uint8_t *inStrings, *nextInStrings;
   2002            uint8_t *outStrings;
   2003 
   2004            uint8_t c;
   2005 
   2006            inStrings=inBytes+groupStringOffset;
   2007            outStrings=outBytes+groupStringOffset;
   2008 
   2009            stringsCount=algNamesOffset-groupStringOffset;
   2010 
   2011            /* iterate through string groups until only a few padding bytes are left */
   2012            while(stringsCount>32) {
   2013                nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
   2014 
   2015                /* move past the length bytes */
   2016                stringsCount-=(uint32_t)(nextInStrings-inStrings);
   2017                outStrings+=nextInStrings-inStrings;
   2018                inStrings=nextInStrings;
   2019 
   2020                count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
   2021                stringsCount-=count;
   2022 
   2023                /* swap the string bytes using map[] and trailMap[] */
   2024                while(count>0) {
   2025                    c=*inStrings++;
   2026                    *outStrings++=map[c];
   2027                    if(tokens[c]!=-2) {
   2028                        --count;
   2029                    } else {
   2030                        /* token lead byte: swap the trail byte, too */
   2031                        *outStrings++=trailMap[*inStrings++];
   2032                        count-=2;
   2033                    }
   2034                }
   2035            }
   2036        }
   2037 
   2038        /* swap the algorithmic ranges */
   2039        offset=algNamesOffset;
   2040        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
   2041        ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
   2042        offset+=4;
   2043 
   2044        for(i=0; i<count; ++i) {
   2045            if(offset>(uint32_t)length) {
   2046                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
   2047                                 length, i);
   2048                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   2049                return 0;
   2050            }
   2051 
   2052            inRange=(const AlgorithmicRange *)(inBytes+offset);
   2053            outRange=(AlgorithmicRange *)(outBytes+offset);
   2054            offset+=ds->readUInt16(inRange->size);
   2055 
   2056            ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
   2057            ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
   2058            switch(inRange->type) {
   2059            case 0:
   2060                /* swap prefix string */
   2061                ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
   2062                                    outRange+1, pErrorCode);
   2063                if(U_FAILURE(*pErrorCode)) {
   2064                    udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
   2065                                     i);
   2066                    return 0;
   2067                }
   2068                break;
   2069            case 1:
   2070                {
   2071                    /* swap factors and the prefix and factor strings */
   2072                    uint32_t factorsCount;
   2073 
   2074                    factorsCount=inRange->variant;
   2075                    p=(const uint16_t *)(inRange+1);
   2076                    q=(uint16_t *)(outRange+1);
   2077                    ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
   2078 
   2079                    /* swap the strings, up to the last terminating NUL */
   2080                    p+=factorsCount;
   2081                    q+=factorsCount;
   2082                    stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
   2083                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
   2084                        --stringsCount;
   2085                    }
   2086                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
   2087                }
   2088                break;
   2089            default:
   2090                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
   2091                                 inRange->type, i);
   2092                *pErrorCode=U_UNSUPPORTED_ERROR;
   2093                return 0;
   2094            }
   2095        }
   2096    }
   2097 
   2098    return headerSize+(int32_t)offset;
   2099 }
   2100 
   2101 /*
   2102 * Hey, Emacs, please set the following:
   2103 *
   2104 * Local Variables:
   2105 * indent-tabs-mode: nil
   2106 * End:
   2107 *
   2108 */
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE