tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucm.cpp (35721B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2003-2013, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  ucm.c
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2003jun20
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This file reads a .ucm file, stores its mappings and sorts them.
     19 *   It implements handling of Unicode conversion mappings from .ucm files
     20 *   for makeconv, canonucm, rptp2ucm, etc.
     21 *
     22 *   Unicode code point sequences with a length of more than 1,
     23 *   as well as byte sequences with more than 4 bytes or more than one complete
     24 *   character sequence are handled to support m:n mappings.
     25 */
     26 
     27 #include "unicode/utypes.h"
     28 #include "unicode/ustring.h"
     29 #include "cstring.h"
     30 #include "cmemory.h"
     31 #include "filestrm.h"
     32 #include "uarrsort.h"
     33 #include "ucnvmbcs.h"
     34 #include "ucnv_bld.h"
     35 #include "ucnv_ext.h"
     36 #include "uparse.h"
     37 #include "ucm.h"
     38 #include <stdio.h>
     39 
     40 #if !UCONFIG_NO_CONVERSION
     41 
     42 /* -------------------------------------------------------------------------- */
     43 
     44 static void
     45 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
     46    int32_t j;
     47 
     48    for(j=0; j<m->uLen; ++j) {
     49        fprintf(f, "<U%04lX>", static_cast<long>(codePoints[j]));
     50    }
     51 
     52    fputc(' ', f);
     53 
     54    for(j=0; j<m->bLen; ++j) {
     55        fprintf(f, "\\x%02X", bytes[j]);
     56    }
     57 
     58    if(m->f>=0) {
     59        fprintf(f, " |%u\n", m->f);
     60    } else {
     61        fputs("\n", f);
     62    }
     63 }
     64 
     65 U_CAPI void U_EXPORT2
     66 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
     67    printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
     68 }
     69 
     70 U_CAPI void U_EXPORT2
     71 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
     72    UCMapping *m;
     73    int32_t i, length;
     74 
     75    m=table->mappings;
     76    length=table->mappingsLength;
     77    if(byUnicode) {
     78        for(i=0; i<length; ++m, ++i) {
     79            ucm_printMapping(table, m, f);
     80        }
     81    } else {
     82        const int32_t *map=table->reverseMap;
     83        for(i=0; i<length; ++i) {
     84            ucm_printMapping(table, m+map[i], f);
     85        }
     86    }
     87 }
     88 
     89 /* mapping comparisons ------------------------------------------------------ */
     90 
     91 static int32_t
     92 compareUnicode(UCMTable *lTable, const UCMapping *l,
     93               UCMTable *rTable, const UCMapping *r) {
     94    const UChar32 *lu, *ru;
     95    int32_t result, i, length;
     96 
     97    if(l->uLen==1 && r->uLen==1) {
     98        /* compare two single code points */
     99        return l->u-r->u;
    100    }
    101 
    102    /* get pointers to the code point sequences */
    103    lu=UCM_GET_CODE_POINTS(lTable, l);
    104    ru=UCM_GET_CODE_POINTS(rTable, r);
    105 
    106    /* get the minimum length */
    107    if(l->uLen<=r->uLen) {
    108        length=l->uLen;
    109    } else {
    110        length=r->uLen;
    111    }
    112 
    113    /* compare the code points */
    114    for(i=0; i<length; ++i) {
    115        result=lu[i]-ru[i];
    116        if(result!=0) {
    117            return result;
    118        }
    119    }
    120 
    121    /* compare the lengths */
    122    return l->uLen-r->uLen;
    123 }
    124 
    125 static int32_t
    126 compareBytes(UCMTable *lTable, const UCMapping *l,
    127             UCMTable *rTable, const UCMapping *r,
    128             UBool lexical) {
    129    const uint8_t *lb, *rb;
    130    int32_t result, i, length;
    131 
    132    /*
    133     * A lexical comparison is used for sorting in the builder, to allow
    134     * an efficient search for a byte sequence that could be a prefix
    135     * of a previously entered byte sequence.
    136     *
    137     * Comparing by lengths first is for compatibility with old .ucm tools
    138     * like canonucm and rptp2ucm.
    139     */
    140    if(lexical) {
    141        /* get the minimum length and continue */
    142        if(l->bLen<=r->bLen) {
    143            length=l->bLen;
    144        } else {
    145            length=r->bLen;
    146        }
    147    } else {
    148        /* compare lengths first */
    149        result=l->bLen-r->bLen;
    150        if(result!=0) {
    151            return result;
    152        } else {
    153            length=l->bLen;
    154        }
    155    }
    156 
    157    /* get pointers to the byte sequences */
    158    lb=UCM_GET_BYTES(lTable, l);
    159    rb=UCM_GET_BYTES(rTable, r);
    160 
    161    /* compare the bytes */
    162    for(i=0; i<length; ++i) {
    163        result=lb[i]-rb[i];
    164        if(result!=0) {
    165            return result;
    166        }
    167    }
    168 
    169    /* compare the lengths */
    170    return l->bLen-r->bLen;
    171 }
    172 
    173 /* compare UCMappings for sorting */
    174 static int32_t
    175 compareMappings(UCMTable *lTable, const UCMapping *l,
    176                UCMTable *rTable, const UCMapping *r,
    177                UBool uFirst) {
    178    int32_t result;
    179 
    180    /* choose which side to compare first */
    181    if(uFirst) {
    182        /* Unicode then bytes */
    183        result=compareUnicode(lTable, l, rTable, r);
    184        if(result==0) {
    185            result=compareBytes(lTable, l, rTable, r, false); /* not lexically, like canonucm */
    186        }
    187    } else {
    188        /* bytes then Unicode */
    189        result=compareBytes(lTable, l, rTable, r, true); /* lexically, for builder */
    190        if(result==0) {
    191            result=compareUnicode(lTable, l, rTable, r);
    192        }
    193    }
    194 
    195    if(result!=0) {
    196        return result;
    197    }
    198 
    199    /* compare the flags */
    200    return l->f-r->f;
    201 }
    202 U_CDECL_BEGIN
    203 /* sorting by Unicode first sorts mappings directly */
    204 static int32_t  U_CALLCONV
    205 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
    206    return compareMappings(
    207        (UCMTable *)context, (const UCMapping *)left,
    208        (UCMTable *)context, (const UCMapping *)right, true);
    209 }
    210 
    211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
    212 static int32_t U_CALLCONV
    213 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
    214    UCMTable *table=(UCMTable *)context;
    215    int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
    216    return compareMappings(
    217        table, table->mappings+l,
    218        table, table->mappings+r, false);
    219 }
    220 U_CDECL_END
    221 
    222 U_CAPI void U_EXPORT2
    223 ucm_sortTable(UCMTable *t) {
    224    UErrorCode errorCode;
    225    int32_t i;
    226 
    227    if(t->isSorted) {
    228        return;
    229    }
    230 
    231    errorCode=U_ZERO_ERROR;
    232 
    233    /* 1. sort by Unicode first */
    234    uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
    235                   compareMappingsUnicodeFirst, t,
    236                   false, &errorCode);
    237 
    238    /* build the reverseMap */
    239    if(t->reverseMap==nullptr) {
    240        /*
    241         * allocate mappingsCapacity instead of mappingsLength so that
    242         * if mappings are added, the reverseMap need not be
    243         * reallocated each time
    244         * (see ucm_moveMappings() and ucm_addMapping())
    245         */
    246        t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
    247        if(t->reverseMap==nullptr) {
    248            fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
    249            exit(U_MEMORY_ALLOCATION_ERROR);
    250        }
    251    }
    252    for(i=0; i<t->mappingsLength; ++i) {
    253        t->reverseMap[i]=i;
    254    }
    255 
    256    /* 2. sort reverseMap by mappings bytes first */
    257    uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
    258                   compareMappingsBytesFirst, t,
    259                   false, &errorCode);
    260 
    261    if(U_FAILURE(errorCode)) {
    262        fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
    263                u_errorName(errorCode));
    264        exit(errorCode);
    265    }
    266 
    267    t->isSorted=true;
    268 }
    269 
    270 /*
    271 * remove mappings with their move flag set from the base table
    272 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
    273 */
    274 U_CAPI void U_EXPORT2
    275 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
    276    UCMapping *mb, *mbLimit;
    277    int8_t flag;
    278 
    279    mb=base->mappings;
    280    mbLimit=mb+base->mappingsLength;
    281 
    282    while(mb<mbLimit) {
    283        flag=mb->moveFlag;
    284        if(flag!=0) {
    285            /* reset the move flag */
    286            mb->moveFlag=0;
    287 
    288            if(ext!=nullptr && (flag&UCM_MOVE_TO_EXT)) {
    289                /* add the mapping to the extension table */
    290                ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
    291            }
    292 
    293            /* remove this mapping: move the last base mapping down and overwrite the current one */
    294            if(mb<(mbLimit-1)) {
    295                uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
    296            }
    297            --mbLimit;
    298            --base->mappingsLength;
    299            base->isSorted=false;
    300        } else {
    301            ++mb;
    302        }
    303    }
    304 }
    305 
/*
 * Result bits returned (OR-ed together) by checkBaseExtUnicode() and
 * checkBaseExtBytes(), and evaluated by ucm_checkBaseExt().
 */
enum {
    NEEDS_MOVE=1,   /* set when a mapping's moveFlag was set */
    HAS_ERRORS=2    /* set when an error was reported on stderr */
};
    310 
    311 static uint8_t
    312 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
    313                    UBool moveToExt, int8_t intersectBase) {
    314    (void)baseStates;
    315 
    316    UCMapping *mb, *me, *mbLimit, *meLimit;
    317    int32_t cmp;
    318    uint8_t result;
    319 
    320    mb=base->mappings;
    321    mbLimit=mb+base->mappingsLength;
    322 
    323    me=ext->mappings;
    324    meLimit=me+ext->mappingsLength;
    325 
    326    result=0;
    327 
    328    for(;;) {
    329        /* skip irrelevant mappings on both sides */
    330        for(;;) {
    331            if(mb==mbLimit) {
    332                return result;
    333            }
    334 
    335            if((0<=mb->f && mb->f<=2) || mb->f==4) {
    336                break;
    337            }
    338 
    339            ++mb;
    340        }
    341 
    342        for(;;) {
    343            if(me==meLimit) {
    344                return result;
    345            }
    346 
    347            if((0<=me->f && me->f<=2) || me->f==4) {
    348                break;
    349            }
    350 
    351            ++me;
    352        }
    353 
    354        /* compare the base and extension mappings */
    355        cmp=compareUnicode(base, mb, ext, me);
    356        if(cmp<0) {
    357            if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
    358                /*
    359                 * mapping in base but not in ext, move it
    360                 *
    361                 * if ext is DBCS, move DBCS mappings here
    362                 * and check SBCS ones for Unicode prefix below
    363                 */
    364                mb->moveFlag|=UCM_MOVE_TO_EXT;
    365                result|=NEEDS_MOVE;
    366 
    367            /* does mb map from an input sequence that is a prefix of me's? */
    368            } else if( mb->uLen<me->uLen &&
    369                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
    370            ) {
    371                if(moveToExt) {
    372                    /* mark this mapping to be moved to the extension table */
    373                    mb->moveFlag|=UCM_MOVE_TO_EXT;
    374                    result|=NEEDS_MOVE;
    375                } else {
    376                    fprintf(stderr,
    377                            "ucm error: the base table contains a mapping whose input sequence\n"
    378                            "           is a prefix of the input sequence of an extension mapping\n");
    379                    ucm_printMapping(base, mb, stderr);
    380                    ucm_printMapping(ext, me, stderr);
    381                    result|=HAS_ERRORS;
    382                }
    383            }
    384 
    385            ++mb;
    386        } else if(cmp==0) {
    387            /*
    388             * same output: remove the extension mapping,
    389             * otherwise treat as an error
    390             */
    391            if( mb->f==me->f && mb->bLen==me->bLen &&
    392                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
    393            ) {
    394                me->moveFlag|=UCM_REMOVE_MAPPING;
    395                result|=NEEDS_MOVE;
    396            } else if(intersectBase) {
    397                /* mapping in base but not in ext, move it */
    398                mb->moveFlag|=UCM_MOVE_TO_EXT;
    399                result|=NEEDS_MOVE;
    400            } else {
    401                fprintf(stderr,
    402                        "ucm error: the base table contains a mapping whose input sequence\n"
    403                        "           is the same as the input sequence of an extension mapping\n"
    404                        "           but it maps differently\n");
    405                ucm_printMapping(base, mb, stderr);
    406                ucm_printMapping(ext, me, stderr);
    407                result|=HAS_ERRORS;
    408            }
    409 
    410            ++mb;
    411        } else /* cmp>0 */ {
    412            ++me;
    413        }
    414    }
    415 }
    416 
    417 static uint8_t
    418 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
    419                  UBool moveToExt, int8_t intersectBase) {
    420    UCMapping *mb, *me;
    421    int32_t *baseMap, *extMap;
    422    int32_t b, e, bLimit, eLimit, cmp;
    423    uint8_t result;
    424    UBool isSISO;
    425 
    426    baseMap=base->reverseMap;
    427    extMap=ext->reverseMap;
    428 
    429    b=e=0;
    430    bLimit=base->mappingsLength;
    431    eLimit=ext->mappingsLength;
    432 
    433    result=0;
    434 
    435    isSISO = static_cast<UBool>(baseStates->outputType == MBCS_OUTPUT_2_SISO);
    436 
    437    for(;;) {
    438        /* skip irrelevant mappings on both sides */
    439        for(;; ++b) {
    440            if(b==bLimit) {
    441                return result;
    442            }
    443            mb=base->mappings+baseMap[b];
    444 
    445            if(intersectBase==2 && mb->bLen==1) {
    446                /*
    447                 * comparing a base against a DBCS extension:
    448                 * leave SBCS base mappings alone
    449                 */
    450                continue;
    451            }
    452 
    453            if(mb->f==0 || mb->f==3) {
    454                break;
    455            }
    456        }
    457 
    458        for(;;) {
    459            if(e==eLimit) {
    460                return result;
    461            }
    462            me=ext->mappings+extMap[e];
    463 
    464            if(me->f==0 || me->f==3) {
    465                break;
    466            }
    467 
    468            ++e;
    469        }
    470 
    471        /* compare the base and extension mappings */
    472        cmp=compareBytes(base, mb, ext, me, true);
    473        if(cmp<0) {
    474            if(intersectBase) {
    475                /* mapping in base but not in ext, move it */
    476                mb->moveFlag|=UCM_MOVE_TO_EXT;
    477                result|=NEEDS_MOVE;
    478 
    479            /*
    480             * does mb map from an input sequence that is a prefix of me's?
    481             * for SI/SO tables, a single byte is never a prefix because it
    482             * occurs in a separate single-byte state
    483             */
    484            } else if( mb->bLen<me->bLen &&
    485                (!isSISO || mb->bLen>1) &&
    486                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
    487            ) {
    488                if(moveToExt) {
    489                    /* mark this mapping to be moved to the extension table */
    490                    mb->moveFlag|=UCM_MOVE_TO_EXT;
    491                    result|=NEEDS_MOVE;
    492                } else {
    493                    fprintf(stderr,
    494                            "ucm error: the base table contains a mapping whose input sequence\n"
    495                            "           is a prefix of the input sequence of an extension mapping\n");
    496                    ucm_printMapping(base, mb, stderr);
    497                    ucm_printMapping(ext, me, stderr);
    498                    result|=HAS_ERRORS;
    499                }
    500            }
    501 
    502            ++b;
    503        } else if(cmp==0) {
    504            /*
    505             * same output: remove the extension mapping,
    506             * otherwise treat as an error
    507             */
    508            if( mb->f==me->f && mb->uLen==me->uLen &&
    509                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
    510            ) {
    511                me->moveFlag|=UCM_REMOVE_MAPPING;
    512                result|=NEEDS_MOVE;
    513            } else if(intersectBase) {
    514                /* mapping in base but not in ext, move it */
    515                mb->moveFlag|=UCM_MOVE_TO_EXT;
    516                result|=NEEDS_MOVE;
    517            } else {
    518                fprintf(stderr,
    519                        "ucm error: the base table contains a mapping whose input sequence\n"
    520                        "           is the same as the input sequence of an extension mapping\n"
    521                        "           but it maps differently\n");
    522                ucm_printMapping(base, mb, stderr);
    523                ucm_printMapping(ext, me, stderr);
    524                result|=HAS_ERRORS;
    525            }
    526 
    527            ++b;
    528        } else /* cmp>0 */ {
    529            ++e;
    530        }
    531    }
    532 }
    533 
    534 U_CAPI UBool U_EXPORT2
    535 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
    536    UCMapping *m, *mLimit;
    537    int32_t count;
    538    UBool isOK;
    539 
    540    m=table->mappings;
    541    mLimit=m+table->mappingsLength;
    542    isOK=true;
    543 
    544    while(m<mLimit) {
    545        count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
    546        if(count<1) {
    547            ucm_printMapping(table, m, stderr);
    548            isOK=false;
    549        }
    550        ++m;
    551    }
    552 
    553    return isOK;
    554 }
    555 
    556 U_CAPI UBool U_EXPORT2
    557 ucm_checkBaseExt(UCMStates *baseStates,
    558                 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
    559                 int8_t intersectBase) {
    560    uint8_t result;
    561 
    562    /* if we have an extension table, we must always use precision flags */
    563    if(base->flagsType&UCM_FLAGS_IMPLICIT) {
    564        fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
    565        return false;
    566    }
    567    if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
    568        fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
    569        return false;
    570    }
    571 
    572    /* checking requires both tables to be sorted */
    573    ucm_sortTable(base);
    574    ucm_sortTable(ext);
    575 
    576    /* check */
    577    result=
    578        checkBaseExtUnicode(baseStates, base, ext, moveTarget != nullptr, intersectBase) |
    579        checkBaseExtBytes(baseStates, base, ext, moveTarget != nullptr, intersectBase);
    580 
    581    if(result&HAS_ERRORS) {
    582        return false;
    583    }
    584 
    585    if(result&NEEDS_MOVE) {
    586        ucm_moveMappings(ext, nullptr);
    587        ucm_moveMappings(base, moveTarget);
    588        ucm_sortTable(base);
    589        ucm_sortTable(ext);
    590        if(moveTarget!=nullptr) {
    591            ucm_sortTable(moveTarget);
    592        }
    593    }
    594 
    595    return true;
    596 }
    597 
    598 /* merge tables for rptp2ucm ------------------------------------------------ */
    599 
    600 U_CAPI void U_EXPORT2
    601 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
    602                const uint8_t *subchar, int32_t subcharLength,
    603                uint8_t subchar1) {
    604    UCMapping *fromUMapping, *toUMapping;
    605    int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
    606 
    607    ucm_sortTable(fromUTable);
    608    ucm_sortTable(toUTable);
    609 
    610    fromUMapping=fromUTable->mappings;
    611    toUMapping=toUTable->mappings;
    612 
    613    fromUTop=fromUTable->mappingsLength;
    614    toUTop=toUTable->mappingsLength;
    615 
    616    fromUIndex=toUIndex=0;
    617 
    618    while(fromUIndex<fromUTop && toUIndex<toUTop) {
    619        cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, true);
    620        if(cmp==0) {
    621            /* equal: roundtrip, nothing to do (flags are initially 0) */
    622            ++fromUMapping;
    623            ++toUMapping;
    624 
    625            ++fromUIndex;
    626            ++toUIndex;
    627        } else if(cmp<0) {
    628            /*
    629             * the fromU mapping does not have a toU counterpart:
    630             * fallback Unicode->codepage
    631             */
    632            if( (fromUMapping->bLen==subcharLength &&
    633                 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
    634                (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
    635            ) {
    636                fromUMapping->f=2; /* SUB mapping */
    637            } else {
    638                fromUMapping->f=1; /* normal fallback */
    639            }
    640 
    641            ++fromUMapping;
    642            ++fromUIndex;
    643        } else {
    644            /*
    645             * the toU mapping does not have a fromU counterpart:
    646             * (reverse) fallback codepage->Unicode, copy it to the fromU table
    647             */
    648 
    649            /* ignore reverse fallbacks to Unicode SUB */
    650            if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
    651                toUMapping->f=3; /* reverse fallback */
    652                ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
    653 
    654                /* the table may have been reallocated */
    655                fromUMapping=fromUTable->mappings+fromUIndex;
    656            }
    657 
    658            ++toUMapping;
    659            ++toUIndex;
    660        }
    661    }
    662 
    663    /* either one or both tables are exhausted */
    664    while(fromUIndex<fromUTop) {
    665        /* leftover fromU mappings are fallbacks */
    666        if( (fromUMapping->bLen==subcharLength &&
    667             0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
    668            (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
    669        ) {
    670            fromUMapping->f=2; /* SUB mapping */
    671        } else {
    672            fromUMapping->f=1; /* normal fallback */
    673        }
    674 
    675        ++fromUMapping;
    676        ++fromUIndex;
    677    }
    678 
    679    while(toUIndex<toUTop) {
    680        /* leftover toU mappings are reverse fallbacks */
    681 
    682        /* ignore reverse fallbacks to Unicode SUB */
    683        if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
    684            toUMapping->f=3; /* reverse fallback */
    685            ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
    686        }
    687 
    688        ++toUMapping;
    689        ++toUIndex;
    690    }
    691 
    692    fromUTable->isSorted=false;
    693 }
    694 
    695 /* separate extension mappings out of base table for rptp2ucm --------------- */
    696 
    697 U_CAPI UBool U_EXPORT2
    698 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
    699    UCMTable *table;
    700    UCMapping *m, *mLimit;
    701    int32_t type;
    702    UBool needsMove, isOK;
    703 
    704    table=ucm->base;
    705    m=table->mappings;
    706    mLimit=m+table->mappingsLength;
    707 
    708    needsMove=false;
    709    isOK=true;
    710 
    711    for(; m<mLimit; ++m) {
    712        if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
    713            fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
    714            ucm_printMapping(table, m, stderr);
    715            m->moveFlag|=UCM_REMOVE_MAPPING;
    716            needsMove=true;
    717            continue;
    718        }
    719 
    720        type=ucm_mappingType(
    721                &ucm->states, m,
    722                UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
    723        if(type<0) {
    724            /* illegal byte sequence */
    725            printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
    726            isOK=false;
    727        } else if(type>0) {
    728            m->moveFlag|=UCM_MOVE_TO_EXT;
    729            needsMove=true;
    730        }
    731    }
    732 
    733    if(!isOK) {
    734        return false;
    735    }
    736    if(needsMove) {
    737        ucm_moveMappings(ucm->base, ucm->ext);
    738        return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, 0);
    739    } else {
    740        ucm_sortTable(ucm->base);
    741        return true;
    742    }
    743 }
    744 
    745 /* ucm parser --------------------------------------------------------------- */
    746 
    747 U_CAPI int8_t U_EXPORT2
    748 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
    749    const char *s=*ps;
    750    char *end;
    751    uint8_t byte;
    752    int8_t bLen;
    753 
    754    bLen=0;
    755    for(;;) {
    756        /* skip an optional plus sign */
    757        if(bLen>0 && *s=='+') {
    758            ++s;
    759        }
    760        if(*s!='\\') {
    761            break;
    762        }
    763 
    764        if( s[1]!='x' ||
    765            (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
    766        ) {
    767            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
    768            return -1;
    769        }
    770 
    771        if(bLen==UCNV_EXT_MAX_BYTES) {
    772            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
    773            return -1;
    774        }
    775        bytes[bLen++]=byte;
    776        s=end;
    777    }
    778 
    779    *ps=s;
    780    return bLen;
    781 }
    782 
    783 /* parse a mapping line; must not be empty */
    784 U_CAPI UBool U_EXPORT2
    785 ucm_parseMappingLine(UCMapping *m,
    786                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    787                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
    788                     const char *line) {
    789    const char *s;
    790    char *end;
    791    UChar32 cp;
    792    int32_t u16Length;
    793    int8_t uLen, bLen, f;
    794 
    795    s=line;
    796    uLen=bLen=0;
    797 
    798    /* parse code points */
    799    for(;;) {
    800        /* skip an optional plus sign */
    801        if(uLen>0 && *s=='+') {
    802            ++s;
    803        }
    804        if(*s!='<') {
    805            break;
    806        }
    807 
    808        if( s[1]!='U' ||
    809            (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
    810            *end!='>'
    811        ) {
    812            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
    813            return false;
    814        }
    815        if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
    816            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
    817            return false;
    818        }
    819 
    820        if(uLen==UCNV_EXT_MAX_UCHARS) {
    821            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
    822            return false;
    823        }
    824        codePoints[uLen++]=cp;
    825        s=end+1;
    826    }
    827 
    828    if(uLen==0) {
    829        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
    830        return false;
    831    } else if(uLen==1) {
    832        m->u=codePoints[0];
    833    } else {
    834        UErrorCode errorCode=U_ZERO_ERROR;
    835        u_strFromUTF32(nullptr, 0, &u16Length, codePoints, uLen, &errorCode);
    836        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
    837            u16Length>UCNV_EXT_MAX_UCHARS
    838        ) {
    839            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
    840            return false;
    841        }
    842    }
    843 
    844    s=u_skipWhitespace(s);
    845 
    846    /* parse bytes */
    847    bLen=ucm_parseBytes(bytes, line, &s);
    848 
    849    if(bLen<0) {
    850        return false;
    851    } else if(bLen==0) {
    852        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
    853        return false;
    854    } else if(bLen<=4) {
    855        uprv_memcpy(m->b.bytes, bytes, bLen);
    856    }
    857 
    858    /* skip everything until the fallback indicator, even the start of a comment */
    859    for(;;) {
    860        if(*s==0) {
    861            f=-1; /* no fallback indicator */
    862            break;
    863        } else if(*s=='|') {
    864            f=(int8_t)(s[1]-'0');
    865            if((uint8_t)f>4) {
    866                fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
    867                return false;
    868            }
    869            break;
    870        }
    871        ++s;
    872    }
    873 
    874    m->uLen=uLen;
    875    m->bLen=bLen;
    876    m->f=f;
    877    return true;
    878 }
    879 
    880 /* general APIs ------------------------------------------------------------- */
    881 
    882 U_CAPI UCMTable * U_EXPORT2
    883 ucm_openTable() {
    884    UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
    885    if(table==nullptr) {
    886        fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
    887        exit(U_MEMORY_ALLOCATION_ERROR);
    888    }
    889 
    890    memset(table, 0, sizeof(UCMTable));
    891    return table;
    892 }
    893 
    894 U_CAPI void U_EXPORT2
    895 ucm_closeTable(UCMTable *table) {
    896    if(table!=nullptr) {
    897        uprv_free(table->mappings);
    898        uprv_free(table->codePoints);
    899        uprv_free(table->bytes);
    900        uprv_free(table->reverseMap);
    901        uprv_free(table);
    902    }
    903 }
    904 
    905 U_CAPI void U_EXPORT2
    906 ucm_resetTable(UCMTable *table) {
    907    if(table!=nullptr) {
    908        table->mappingsLength=0;
    909        table->flagsType=0;
    910        table->unicodeMask=0;
    911        table->bytesLength=table->codePointsLength=0;
    912        table->isSorted=false;
    913    }
    914 }
    915 
    916 U_CAPI void U_EXPORT2
    917 ucm_addMapping(UCMTable *table,
    918               UCMapping *m,
    919               UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
    920               uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    921    UCMapping *tm;
    922    UChar32 c;
    923    int32_t idx;
    924 
    925    if(table->mappingsLength>=table->mappingsCapacity) {
    926        /* make the mappings array larger */
    927        if(table->mappingsCapacity==0) {
    928            table->mappingsCapacity=1000;
    929        } else {
    930            table->mappingsCapacity*=10;
    931        }
    932        table->mappings=(UCMapping *)uprv_realloc(table->mappings,
    933                                             table->mappingsCapacity*sizeof(UCMapping));
    934        if(table->mappings==nullptr) {
    935            fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
    936                            (int)table->mappingsCapacity);
    937            exit(U_MEMORY_ALLOCATION_ERROR);
    938        }
    939 
    940        if(table->reverseMap!=nullptr) {
    941            /* the reverseMap must be reallocated in a new sort */
    942            uprv_free(table->reverseMap);
    943            table->reverseMap=nullptr;
    944        }
    945    }
    946 
    947    if(m->uLen>1 && table->codePointsCapacity==0) {
    948        table->codePointsCapacity=10000;
    949        table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
    950        if(table->codePoints==nullptr) {
    951            fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
    952                            (int)table->codePointsCapacity);
    953            exit(U_MEMORY_ALLOCATION_ERROR);
    954        }
    955    }
    956 
    957    if(m->bLen>4 && table->bytesCapacity==0) {
    958        table->bytesCapacity=10000;
    959        table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
    960        if(table->bytes==nullptr) {
    961            fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
    962                            (int)table->bytesCapacity);
    963            exit(U_MEMORY_ALLOCATION_ERROR);
    964        }
    965    }
    966 
    967    if(m->uLen>1) {
    968        idx=table->codePointsLength;
    969        table->codePointsLength+=m->uLen;
    970        if(table->codePointsLength>table->codePointsCapacity) {
    971            fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
    972            exit(U_MEMORY_ALLOCATION_ERROR);
    973        }
    974 
    975        uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4);
    976        m->u=idx;
    977    }
    978 
    979    if(m->bLen>4) {
    980        idx=table->bytesLength;
    981        table->bytesLength+=m->bLen;
    982        if(table->bytesLength>table->bytesCapacity) {
    983            fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
    984            exit(U_MEMORY_ALLOCATION_ERROR);
    985        }
    986 
    987        uprv_memcpy(table->bytes+idx, bytes, m->bLen);
    988        m->b.idx=idx;
    989    }
    990 
    991    /* set unicodeMask */
    992    for(idx=0; idx<m->uLen; ++idx) {
    993        c=codePoints[idx];
    994        if(c>=0x10000) {
    995            table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
    996        } else if(U_IS_SURROGATE(c)) {
    997            table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
    998        }
    999    }
   1000 
   1001    /* set flagsType */
   1002    if(m->f<0) {
   1003        table->flagsType|=UCM_FLAGS_IMPLICIT;
   1004    } else {
   1005        table->flagsType|=UCM_FLAGS_EXPLICIT;
   1006    }
   1007 
   1008    tm=table->mappings+table->mappingsLength++;
   1009    uprv_memcpy(tm, m, sizeof(UCMapping));
   1010 
   1011    table->isSorted=false;
   1012 }
   1013 
   1014 U_CAPI UCMFile * U_EXPORT2
   1015 ucm_open() {
   1016    UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
   1017    if(ucm==nullptr) {
   1018        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
   1019        exit(U_MEMORY_ALLOCATION_ERROR);
   1020    }
   1021 
   1022    memset(ucm, 0, sizeof(UCMFile));
   1023 
   1024    ucm->base=ucm_openTable();
   1025    ucm->ext=ucm_openTable();
   1026 
   1027    ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
   1028    ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
   1029    ucm->states.outputType=-1;
   1030    ucm->states.minCharLength=ucm->states.maxCharLength=1;
   1031 
   1032    return ucm;
   1033 }
   1034 
   1035 U_CAPI void U_EXPORT2
   1036 ucm_close(UCMFile *ucm) {
   1037    if(ucm!=nullptr) {
   1038        ucm_closeTable(ucm->base);
   1039        ucm_closeTable(ucm->ext);
   1040        uprv_free(ucm);
   1041    }
   1042 }
   1043 
   1044 U_CAPI int32_t U_EXPORT2
   1045 ucm_mappingType(UCMStates *baseStates,
   1046                UCMapping *m,
   1047                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   1048                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
   1049    (void)codePoints;
   1050    /* check validity of the bytes and count the characters in them */
   1051    int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
   1052    if(count<1) {
   1053        /* illegal byte sequence */
   1054        return -1;
   1055    }
   1056 
   1057    /*
   1058     * Suitable for an ICU conversion base table means:
   1059     * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
   1060     * - precision flag 0..3
   1061     * - SBCS: any 1:1 mapping
   1062     *         (the table stores additional bits to distinguish mapping types)
   1063     * - MBCS: not a |2 SUB mapping for <subchar1>
   1064     * - MBCS: not a |1 fallback to 0x00
   1065     * - MBCS: not a multi-byte mapping with leading 0x00 bytes
   1066     *
   1067     * Further restrictions for fromUnicode tables
   1068     * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
   1069     *
   1070     * All of the MBCS fromUnicode specific tests could be removed from here,
   1071     * but the ones above are for unusual mappings, and removing the tests
   1072     * from here would change canonucm output which seems gratuitous.
   1073     * (Markus Scherer 2006-nov-28)
   1074     *
   1075     * Exception: All implicit mappings (f<0) that need to be moved
   1076     * because of fromUnicode restrictions _must_ be moved here because
   1077     * makeconv uses a hack for moving mappings only for the fromUnicode table
   1078     * that only works with non-negative values of f.
   1079     */
   1080    if( m->uLen==1 && count==1 && m->f<=3 &&
   1081        (baseStates->maxCharLength==1 ||
   1082            !((m->f==2 && m->bLen==1) ||
   1083              (m->f==1 && bytes[0]==0) ||
   1084              (m->f<=1 && m->bLen>1 && bytes[0]==0)))
   1085    ) {
   1086        return 0; /* suitable for a base table */
   1087    } else {
   1088        return 1; /* needs to go into an extension table */
   1089    }
   1090 }
   1091 
   1092 U_CAPI UBool U_EXPORT2
   1093 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
   1094                   UCMapping *m,
   1095                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
   1096                   uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
   1097    int32_t type;
   1098 
   1099    if(m->f==2 && m->uLen>1) {
   1100        fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
   1101        printMapping(m, codePoints, bytes, stderr);
   1102        return false;
   1103    }
   1104 
   1105    if(baseStates!=nullptr) {
   1106        /* check validity of the bytes and count the characters in them */
   1107        type=ucm_mappingType(baseStates, m, codePoints, bytes);
   1108        if(type<0) {
   1109            /* illegal byte sequence */
   1110            printMapping(m, codePoints, bytes, stderr);
   1111            return false;
   1112        }
   1113    } else {
   1114        /* not used - adding a mapping for an extension-only table before its base table is read */
   1115        type=1;
   1116    }
   1117 
   1118    /*
   1119     * Add the mapping to the base table if this is requested and suitable.
   1120     * Otherwise, add it to the extension table.
   1121     */
   1122    if(forBase && type==0) {
   1123        ucm_addMapping(ucm->base, m, codePoints, bytes);
   1124    } else {
   1125        ucm_addMapping(ucm->ext, m, codePoints, bytes);
   1126    }
   1127 
   1128    return true;
   1129 }
   1130 
   1131 U_CAPI UBool U_EXPORT2
   1132 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
   1133  UCMapping m={ 0, {0}, 0, 0, 0, 0 };
   1134    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
   1135    uint8_t bytes[UCNV_EXT_MAX_BYTES];
   1136 
   1137    const char *s;
   1138 
   1139    /* ignore empty and comment lines */
   1140    if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
   1141        return true;
   1142    }
   1143 
   1144    return
   1145        ucm_parseMappingLine(&m, codePoints, bytes, line) &&
   1146        ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
   1147 }
   1148 
   1149 U_CAPI void U_EXPORT2
   1150 ucm_readTable(UCMFile *ucm, FileStream* convFile,
   1151              UBool forBase, UCMStates *baseStates,
   1152              UErrorCode *pErrorCode) {
   1153    char line[500];
   1154    char *end;
   1155    UBool isOK;
   1156 
   1157    if(U_FAILURE(*pErrorCode)) {
   1158        return;
   1159    }
   1160 
   1161    isOK=true;
   1162 
   1163    for(;;) {
   1164        /* read the next line */
   1165        if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
   1166            fprintf(stderr, "incomplete charmap section\n");
   1167            isOK=false;
   1168            break;
   1169        }
   1170 
   1171        /* remove CR LF */
   1172        end=uprv_strchr(line, 0);
   1173        while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
   1174            --end;
   1175        }
   1176        *end=0;
   1177 
   1178        /* ignore empty and comment lines */
   1179        if(line[0]==0 || line[0]=='#') {
   1180            continue;
   1181        }
   1182 
   1183        /* stop at the end of the mapping table */
   1184        if(0==uprv_strcmp(line, "END CHARMAP")) {
   1185            break;
   1186        }
   1187 
   1188        isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
   1189    }
   1190 
   1191    if(!isOK) {
   1192        *pErrorCode=U_INVALID_TABLE_FORMAT;
   1193    }
   1194 }
   1195 #endif