tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucmstate.cpp (39786B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2003-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  ucmstate.c
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2003oct09
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This file handles ICU .ucm file state information as part of the ucm module.
     19 *   Most of this code used to be in makeconv.c.
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 #include "cstring.h"
     24 #include "cmemory.h"
     25 #include "uarrsort.h"
     26 #include "ucnvmbcs.h"
     27 #include "ucnv_ext.h"
     28 #include "uparse.h"
     29 #include "ucm.h"
     30 #include <stdio.h>
     31 
     32 #if !UCONFIG_NO_CONVERSION
     33 
     34 /* MBCS state handling ------------------------------------------------------ */
     35 
     36 /*
     37 * state table row grammar (ebnf-style):
     38 * (whitespace is allowed between all tokens)
     39 *
     40 * row=[[firstentry ','] entry (',' entry)*]
     41 * firstentry="initial" | "surrogates"
     42 *            (initial state (default for state 0), output is all surrogate pairs)
     43 * entry=range [':' nextstate] ['.' action]
     44 * range=number ['-' number]
     45 * nextstate=number
     46 *           (0..7f)
     47 * action='u' | 's' | 'p' | 'i'
     48 *        (unassigned, state change only, surrogate pair, illegal)
     49 * number=(1- or 2-digit hexadecimal number)
     50 */
     51 static const char *
     52 parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
     53    const char *t;
     54    uint32_t start, end, i;
     55    int32_t entry;
     56 
     57    /* initialize the state: all illegal with U+ffff */
     58    for(i=0; i<256; ++i) {
     59        state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
     60    }
     61 
     62    /* skip leading white space */
     63    s=u_skipWhitespace(s);
     64 
     65    /* is there an "initial" or "surrogates" directive? */
     66    if(uprv_strncmp("initial", s, 7)==0) {
     67        *pFlags=MBCS_STATE_FLAG_DIRECT;
     68        s=u_skipWhitespace(s+7);
     69        if(*s++!=',') {
     70            return s-1;
     71        }
     72    } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
     73        *pFlags=MBCS_STATE_FLAG_SURROGATES;
     74        s=u_skipWhitespace(s+10);
     75        if(*s++!=',') {
     76            return s-1;
     77        }
     78    } else if(*s==0) {
     79        /* empty state row: all-illegal */
     80        return nullptr;
     81    }
     82 
     83    for(;;) {
     84        /* read an entry, the start of the range first */
     85        s=u_skipWhitespace(s);
     86        start=uprv_strtoul(s, (char **)&t, 16);
     87        if(s==t || 0xff<start) {
     88            return s;
     89        }
     90        s=u_skipWhitespace(t);
     91 
     92        /* read the end of the range if there is one */
     93        if(*s=='-') {
     94            s=u_skipWhitespace(s+1);
     95            end=uprv_strtoul(s, (char **)&t, 16);
     96            if(s==t || end<start || 0xff<end) {
     97                return s;
     98            }
     99            s=u_skipWhitespace(t);
    100        } else {
    101            end=start;
    102        }
    103 
    104        /* determine the state entry for this range */
    105        if(*s!=':' && *s!='.') {
    106            /* the default is: final state with valid entries */
    107            entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
    108        } else {
    109            entry=MBCS_ENTRY_TRANSITION(0, 0);
    110            if(*s==':') {
    111                /* get the next state, default to 0 */
    112                s=u_skipWhitespace(s+1);
    113                i=uprv_strtoul(s, (char **)&t, 16);
    114                if(s!=t) {
    115                    if(0x7f<i) {
    116                        return s;
    117                    }
    118                    s=u_skipWhitespace(t);
    119                    entry=MBCS_ENTRY_SET_STATE(entry, i);
    120                }
    121            }
    122 
    123            /* get the state action, default to valid */
    124            if(*s=='.') {
    125                /* this is a final state */
    126                entry=MBCS_ENTRY_SET_FINAL(entry);
    127 
    128                s=u_skipWhitespace(s+1);
    129                if(*s=='u') {
    130                    /* unassigned set U+fffe */
    131                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
    132                    s=u_skipWhitespace(s+1);
    133                } else if(*s=='p') {
    134                    if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
    135                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
    136                    } else {
    137                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
    138                    }
    139                    s=u_skipWhitespace(s+1);
    140                } else if(*s=='s') {
    141                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
    142                    s=u_skipWhitespace(s+1);
    143                } else if(*s=='i') {
    144                    /* illegal set U+ffff */
    145                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
    146                    s=u_skipWhitespace(s+1);
    147                } else {
    148                    /* default to valid */
    149                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
    150                }
    151            } else {
    152                /* this is an intermediate state, nothing to do */
    153            }
    154        }
    155 
    156        /* adjust "final valid" states according to the state flags */
    157        if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
    158            switch(*pFlags) {
    159            case 0:
    160                /* no adjustment */
    161                break;
    162            case MBCS_STATE_FLAG_DIRECT:
    163                /* set the valid-direct code point to "unassigned"==0xfffe */
    164                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
    165                break;
    166            case MBCS_STATE_FLAG_SURROGATES:
    167                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
    168                break;
    169            default:
    170                break;
    171            }
    172        }
    173 
    174        /* set this entry for the range */
    175        for(i=start; i<=end; ++i) {
    176            state[i]=entry;
    177        }
    178 
    179        if(*s==',') {
    180            ++s;
    181        } else {
    182            return *s==0 ? nullptr : s;
    183        }
    184    }
    185 }
    186 
    187 U_CAPI void U_EXPORT2
    188 ucm_addState(UCMStates *states, const char *s) {
    189    const char *error;
    190 
    191    if(states->countStates==MBCS_MAX_STATE_COUNT) {
    192        fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
    193        exit(U_INVALID_TABLE_FORMAT);
    194    }
    195 
    196    error=parseState(s, states->stateTable[states->countStates],
    197                       &states->stateFlags[states->countStates]);
    198    if(error!=nullptr) {
    199        fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error);
    200        exit(U_INVALID_TABLE_FORMAT);
    201    }
    202 
    203    ++states->countStates;
    204 }
    205 
    206 U_CAPI UBool U_EXPORT2
    207 ucm_parseHeaderLine(UCMFile *ucm,
    208                    char *line, char **pKey, char **pValue) {
    209    UCMStates *states;
    210    char *s, *end;
    211    char c;
    212 
    213    states=&ucm->states;
    214 
    215    /* remove comments and trailing CR and LF and remove whitespace from the end */
    216    for(end=line; (c=*end)!=0; ++end) {
    217        if(c=='#' || c=='\r' || c=='\n') {
    218            break;
    219        }
    220    }
    221    while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
    222        --end;
    223    }
    224    *end=0;
    225 
    226    /* skip leading white space and ignore empty lines */
    227    s=(char *)u_skipWhitespace(line);
    228    if(*s==0) {
    229        return true;
    230    }
    231 
    232    /* stop at the beginning of the mapping section */
    233    if(uprv_memcmp(s, "CHARMAP", 7)==0) {
    234        return false;
    235    }
    236 
    237    /* get the key name, bracketed in <> */
    238    if(*s!='<') {
    239        fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
    240        exit(U_INVALID_TABLE_FORMAT);
    241    }
    242    *pKey=++s;
    243    while(*s!='>') {
    244        if(*s==0) {
    245            fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
    246            exit(U_INVALID_TABLE_FORMAT);
    247        }
    248        ++s;
    249    }
    250    *s=0;
    251 
    252    /* get the value string, possibly quoted */
    253    s=(char *)u_skipWhitespace(s+1);
    254    if(*s!='"') {
    255        *pValue=s;
    256    } else {
    257        /* remove the quotes */
    258        *pValue=s+1;
    259        if(end>*pValue && *(end-1)=='"') {
    260            *--end=0;
    261        }
    262    }
    263 
    264    /* collect the information from the header field, ignore unknown keys */
    265    if(uprv_strcmp(*pKey, "uconv_class")==0) {
    266        if(uprv_strcmp(*pValue, "DBCS")==0) {
    267            states->conversionType=UCNV_DBCS;
    268        } else if(uprv_strcmp(*pValue, "SBCS")==0) {
    269            states->conversionType = UCNV_SBCS;
    270        } else if(uprv_strcmp(*pValue, "MBCS")==0) {
    271            states->conversionType = UCNV_MBCS;
    272        } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
    273            states->conversionType = UCNV_EBCDIC_STATEFUL;
    274        } else {
    275            fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
    276            exit(U_INVALID_TABLE_FORMAT);
    277        }
    278        return true;
    279    } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
    280        c=**pValue;
    281        if('1'<=c && c<='4' && (*pValue)[1]==0) {
    282            states->maxCharLength=(int8_t)(c-'0');
    283            states->outputType=(int8_t)(states->maxCharLength-1);
    284        } else {
    285            fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
    286            exit(U_INVALID_TABLE_FORMAT);
    287        }
    288        return true;
    289    } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
    290        c=**pValue;
    291        if('1'<=c && c<='4' && (*pValue)[1]==0) {
    292            states->minCharLength=(int8_t)(c-'0');
    293        } else {
    294            fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
    295            exit(U_INVALID_TABLE_FORMAT);
    296        }
    297        return true;
    298    } else if(uprv_strcmp(*pKey, "icu:state")==0) {
    299        /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
    300        switch(states->conversionType) {
    301        case UCNV_SBCS:
    302        case UCNV_DBCS:
    303        case UCNV_EBCDIC_STATEFUL:
    304            states->conversionType=UCNV_MBCS;
    305            break;
    306        case UCNV_MBCS:
    307            break;
    308        default:
    309            fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
    310            exit(U_INVALID_TABLE_FORMAT);
    311        }
    312 
    313        if(states->maxCharLength==0) {
    314            fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
    315            exit(U_INVALID_TABLE_FORMAT);
    316        }
    317        ucm_addState(states, *pValue);
    318        return true;
    319    } else if(uprv_strcmp(*pKey, "icu:base")==0) {
    320        if(**pValue==0) {
    321            fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
    322            exit(U_INVALID_TABLE_FORMAT);
    323        }
    324        uprv_strcpy(ucm->baseName, *pValue);
    325        return true;
    326    }
    327 
    328    return false;
    329 }
    330 
    331 /* post-processing ---------------------------------------------------------- */
    332 
    333 static int32_t
    334 sumUpStates(UCMStates *states) {
    335    int32_t entry, sum, state, cell, count;
    336    UBool allStatesReady;
    337 
    338    /*
    339     * Sum up the offsets for all states.
    340     * In each final state (where there are only final entries),
    341     * the offsets add up directly.
    342     * In all other state table rows, for each transition entry to another state,
    343     * the offsets sum of that state needs to be added.
    344     * This is achieved in at most countStates iterations.
    345     */
    346    allStatesReady=false;
    347    for(count=states->countStates; !allStatesReady && count>=0; --count) {
    348        allStatesReady=true;
    349        for(state=states->countStates-1; state>=0; --state) {
    350            if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
    351                allStatesReady=false;
    352                sum=0;
    353 
    354                /* at first, add up only the final delta offsets to keep them <512 */
    355                for(cell=0; cell<256; ++cell) {
    356                    entry=states->stateTable[state][cell];
    357                    if(MBCS_ENTRY_IS_FINAL(entry)) {
    358                        switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
    359                        case MBCS_STATE_VALID_16:
    360                            states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
    361                            sum+=1;
    362                            break;
    363                        case MBCS_STATE_VALID_16_PAIR:
    364                            states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
    365                            sum+=2;
    366                            break;
    367                        default:
    368                            /* no addition */
    369                            break;
    370                        }
    371                    }
    372                }
    373 
    374                /* now, add up the delta offsets for the transitional entries */
    375                for(cell=0; cell<256; ++cell) {
    376                    entry=states->stateTable[state][cell];
    377                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    378                        if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
    379                            states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
    380                            sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
    381                        } else {
    382                            /* that next state does not have a sum yet, we cannot finish the one for this state */
    383                            sum=-1;
    384                            break;
    385                        }
    386                    }
    387                }
    388 
    389                if(sum!=-1) {
    390                    states->stateOffsetSum[state]=sum;
    391                    states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
    392                }
    393            }
    394        }
    395    }
    396 
    397    if(!allStatesReady) {
    398        fprintf(stderr, "ucm error: the state table contains loops\n");
    399        exit(U_INVALID_TABLE_FORMAT);
    400    }
    401 
    402    /*
    403     * For all "direct" (i.e., initial) states>0,
    404     * the offsets need to be increased by the sum of
    405     * the previous initial states.
    406     */
    407    sum=states->stateOffsetSum[0];
    408    for(state=1; state<states->countStates; ++state) {
    409        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
    410            int32_t sum2=sum;
    411            sum+=states->stateOffsetSum[state];
    412            for(cell=0; cell<256; ++cell) {
    413                entry=states->stateTable[state][cell];
    414                if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    415                    states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
    416                }
    417            }
    418        }
    419    }
    420 
    421    /* round up to the next even number to have the following data 32-bit-aligned */
    422    return states->countToUCodeUnits=(sum+1)&~1;
    423 }
    424 
    425 U_CAPI void U_EXPORT2
    426 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) {
    427    int32_t entry, state, cell, count;
    428 
    429    if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
    430        fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
    431        exit(U_INVALID_TABLE_FORMAT);
    432    }
    433 
    434    if(states->countStates==0) {
    435        switch(states->conversionType) {
    436        case UCNV_SBCS:
    437            /* SBCS: use MBCS data structure with a default state table */
    438            if(states->maxCharLength!=1) {
    439                fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
    440                exit(U_INVALID_TABLE_FORMAT);
    441            }
    442            states->conversionType=UCNV_MBCS;
    443            ucm_addState(states, "0-ff");
    444            break;
    445        case UCNV_MBCS:
    446            fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n");
    447            exit(U_INVALID_TABLE_FORMAT);
    448            break;
    449        case UCNV_EBCDIC_STATEFUL:
    450            /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
    451            if(states->minCharLength!=1 || states->maxCharLength!=2) {
    452                fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
    453                exit(U_INVALID_TABLE_FORMAT);
    454            }
    455            states->conversionType=UCNV_MBCS;
    456            ucm_addState(states, "0-ff, e:1.s, f:0.s");
    457            ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
    458            ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
    459            ucm_addState(states, "0-ff:1.i, 40:1.");
    460            ucm_addState(states, "0-ff:1.i");
    461            break;
    462        case UCNV_DBCS:
    463            /* DBCS: use MBCS data structure with a default state table */
    464            if(states->minCharLength!=2 || states->maxCharLength!=2) {
    465                fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
    466                exit(U_INVALID_TABLE_FORMAT);
    467            }
    468            states->conversionType = UCNV_MBCS;
    469            ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
    470            ucm_addState(states, "41-fe");
    471            ucm_addState(states, "40");
    472            ucm_addState(states, "");
    473            break;
    474        default:
    475            fprintf(stderr, "ucm error: unknown charset structure\n");
    476            exit(U_INVALID_TABLE_FORMAT);
    477            break;
    478        }
    479    }
    480 
    481    /*
    482     * check that the min/max character lengths are reasonable;
    483     * to do this right, all paths through the state table would have to be
    484     * recursively walked while keeping track of the sequence lengths,
    485     * but these simple checks cover most state tables in practice
    486     */
    487    if(states->maxCharLength<states->minCharLength) {
    488        fprintf(stderr, "ucm error: max B/char < min B/char\n");
    489        exit(U_INVALID_TABLE_FORMAT);
    490    }
    491 
    492    /* count non-direct states and compare with max B/char */
    493    count=0;
    494    for(state=0; state<states->countStates; ++state) {
    495        if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
    496            ++count;
    497        }
    498    }
    499    if(states->maxCharLength>count+1) {
    500        fprintf(stderr, "ucm error: max B/char too large\n");
    501        exit(U_INVALID_TABLE_FORMAT);
    502    }
    503 
    504    if(states->minCharLength==1) {
    505        int32_t action;
    506 
    507        /*
    508         * if there are single-byte characters,
    509         * then the initial state must have direct result states
    510         */
    511        for(cell=0; cell<256; ++cell) {
    512            entry=states->stateTable[0][cell];
    513            if( MBCS_ENTRY_IS_FINAL(entry) &&
    514                ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
    515                 action==MBCS_STATE_UNASSIGNED)
    516            ) {
    517                break;
    518            }
    519        }
    520 
    521        if(cell==256) {
    522            fprintf(stderr, "ucm warning: min B/char too small\n");
    523        }
    524    }
    525 
    526    /*
    527     * make sure that all "next state" values are within limits
    528     * and that all next states after final ones have the "direct"
    529     * flag of initial states
    530     */
    531    for(state=states->countStates-1; state>=0; --state) {
    532        for(cell=0; cell<256; ++cell) {
    533            entry=states->stateTable[state][cell];
    534            if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
    535                fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
    536                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
    537                exit(U_INVALID_TABLE_FORMAT);
    538            }
    539            if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
    540                fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
    541                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
    542                exit(U_INVALID_TABLE_FORMAT);
    543            } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
    544                fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
    545                    (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
    546                exit(U_INVALID_TABLE_FORMAT);
    547            }
    548        }
    549    }
    550 
    551    /* is this an SI/SO (like EBCDIC-stateful) state table? */
    552    if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
    553        if(states->maxCharLength!=2) {
    554            fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength);
    555            exit(U_INVALID_TABLE_FORMAT);
    556        }
    557        if(states->countStates<3) {
    558            fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates);
    559            exit(U_INVALID_TABLE_FORMAT);
    560        }
    561        /* are the SI/SO all in the right places? */
    562        if( ignoreSISOCheck ||
    563           (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
    564            states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
    565            states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
    566            states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0))
    567        ) {
    568            states->outputType=MBCS_OUTPUT_2_SISO;
    569        } else {
    570            fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
    571            exit(U_INVALID_TABLE_FORMAT);
    572        }
    573        state=2;
    574    } else {
    575        state=1;
    576    }
    577 
    578    /* check that no unexpected state is a "direct" one */
    579    while(state<states->countStates) {
    580        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
    581            fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state);
    582            exit(U_INVALID_TABLE_FORMAT);
    583        }
    584        ++state;
    585    }
    586 
    587    sumUpStates(states);
    588 }
    589 
    590 /* find a fallback for this offset; return the index or -1 if not found */
    591 U_CAPI int32_t U_EXPORT2
    592 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    593                 uint32_t offset) {
    594    int32_t i;
    595 
    596    if(countToUFallbacks==0) {
    597        /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
    598        return -1;
    599    }
    600 
    601    /* do a linear search for the fallback mapping (the table is not yet sorted) */
    602    for(i=0; i<countToUFallbacks; ++i) {
    603        if(offset==toUFallbacks[i].offset) {
    604            return i;
    605        }
    606    }
    607    return -1;
    608 }
    609 
    610 /*
    611 * This function tries to compact toUnicode tables for 2-byte codepages
    612 * by finding lead bytes with all-unassigned trail bytes and adding another state
    613 * for them.
    614 */
    615 static void
    616 compactToUnicode2(UCMStates *states,
    617                  uint16_t **pUnicodeCodeUnits,
    618                  _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    619                  UBool verbose) {
    620    int32_t (*oldStateTable)[256];
    621    uint16_t count[256];
    622    uint16_t *oldUnicodeCodeUnits;
    623    int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
    624    int32_t i, j, leadState, trailState, newState, fallback;
    625    uint16_t unit;
    626 
    627    /* find the lead state */
    628    if(states->outputType==MBCS_OUTPUT_2_SISO) {
    629        /* use the DBCS lead state for SI/SO codepages */
    630        leadState=1;
    631    } else {
    632        leadState=0;
    633    }
    634 
    635    /* find the main trail state: the most used target state */
    636    uprv_memset(count, 0, sizeof(count));
    637    for(i=0; i<256; ++i) {
    638        entry=states->stateTable[leadState][i];
    639        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    640            ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
    641        }
    642    }
    643    trailState=0;
    644    for(i=1; i<states->countStates; ++i) {
    645        if(count[i]>count[trailState]) {
    646            trailState=i;
    647        }
    648    }
    649 
    650    /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
    651    uprv_memset(count, 0, sizeof(count));
    652    savings=0;
    653    /* for each lead byte */
    654    for(i=0; i<256; ++i) {
    655        entry=states->stateTable[leadState][i];
    656        if(MBCS_ENTRY_IS_TRANSITION(entry) &&
    657                (MBCS_ENTRY_TRANSITION_STATE(entry))==static_cast<uint32_t>(trailState)) {
    658            /* the offset is different for each lead byte */
    659            offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
    660            /* for each trail byte for this lead byte */
    661            for(j=0; j<256; ++j) {
    662                entry=states->stateTable[trailState][j];
    663                switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
    664                case MBCS_STATE_VALID_16:
    665                    entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    666                    if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
    667                        ++count[i];
    668                    } else {
    669                        j=999; /* do not count for this lead byte because there are assignments */
    670                    }
    671                    break;
    672                case MBCS_STATE_VALID_16_PAIR:
    673                    entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    674                    if((*pUnicodeCodeUnits)[entry]==0xfffe) {
    675                        count[i]+=2;
    676                    } else {
    677                        j=999; /* do not count for this lead byte because there are assignments */
    678                    }
    679                    break;
    680                default:
    681                    break;
    682                }
    683            }
    684            if(j==256) {
    685                /* all trail bytes for this lead byte are unassigned */
    686                savings+=count[i];
    687            } else {
    688                count[i]=0;
    689            }
    690        }
    691    }
    692    /* subtract from the possible savings the cost of an additional state */
    693    savings=savings*2-1024; /* count bytes, not 16-bit words */
    694    if(savings<=0) {
    695        return;
    696    }
    697    if(verbose) {
    698        printf("compacting toUnicode data saves %ld bytes\n", static_cast<long>(savings));
    699    }
    700    if(states->countStates>=MBCS_MAX_STATE_COUNT) {
    701        fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
    702        return;
    703    }
    704 
    705    /* make a copy of the state table */
    706    oldStateTable = static_cast<int32_t(*)[256]>(uprv_malloc(states->countStates * 1024));
    707    if(oldStateTable==nullptr) {
    708        fprintf(stderr, "cannot compact toUnicode: out of memory\n");
    709        return;
    710    }
    711    uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
    712 
    713    /* add the new state */
    714    /*
    715     * this function does not catch the degenerate case where all lead bytes
    716     * have all-unassigned trail bytes and the lead state could be removed
    717     */
    718    newState=states->countStates++;
    719    states->stateFlags[newState]=0;
    720    /* copy the old trail state, turning all assigned states into unassigned ones */
    721    for(i=0; i<256; ++i) {
    722        entry=states->stateTable[trailState][i];
    723        switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
    724        case MBCS_STATE_VALID_16:
    725        case MBCS_STATE_VALID_16_PAIR:
    726            states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
    727            break;
    728        default:
    729            states->stateTable[newState][i]=entry;
    730            break;
    731        }
    732    }
    733 
    734    /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
    735    for(i=0; i<256; ++i) {
    736        if(count[i]>0) {
    737            states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
    738        }
    739    }
    740 
    741    /* sum up the new state table */
    742    for(i=0; i<states->countStates; ++i) {
    743        states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
    744    }
    745    sum=sumUpStates(states);
    746 
    747    /* allocate a new, smaller code units array */
    748    oldUnicodeCodeUnits=*pUnicodeCodeUnits;
    749    if(sum==0) {
    750        *pUnicodeCodeUnits=nullptr;
    751        if(oldUnicodeCodeUnits!=nullptr) {
    752            uprv_free(oldUnicodeCodeUnits);
    753        }
    754        uprv_free(oldStateTable);
    755        return;
    756    }
    757    *pUnicodeCodeUnits = static_cast<uint16_t*>(uprv_malloc(sum * sizeof(uint16_t)));
    758    if(*pUnicodeCodeUnits==nullptr) {
    759        fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
    760            static_cast<long>(sum));
    761        /* revert to the old state table */
    762        *pUnicodeCodeUnits=oldUnicodeCodeUnits;
    763        --states->countStates;
    764        uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
    765        uprv_free(oldStateTable);
    766        return;
    767    }
    768    for(i=0; i<sum; ++i) {
    769        (*pUnicodeCodeUnits)[i]=0xfffe;
    770    }
    771 
    772    /* copy the code units for all assigned characters */
    773    /*
    774     * The old state table has the same lead _and_ trail states for assigned characters!
    775     * The differences are in the offsets, and in the trail states for some unassigned characters.
    776     * For each character with an assigned state in the new table, it was assigned in the old one.
    777     * Only still-assigned characters are copied.
    778     * Note that fallback mappings need to get their offset values adjusted.
    779     */
    780 
    781    /* for each initial state */
    782    for(leadState=0; leadState<states->countStates; ++leadState) {
    783        if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
    784            /* for each lead byte from there */
    785            for(i=0; i<256; ++i) {
    786                entry=states->stateTable[leadState][i];
    787                if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    788                    trailState = static_cast<uint8_t>(MBCS_ENTRY_TRANSITION_STATE(entry));
    789                    /* the new state does not have assigned states */
    790                    if(trailState!=newState) {
    791                        trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
    792                        oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
    793                        /* for each trail byte */
    794                        for(j=0; j<256; ++j) {
    795                            entry=states->stateTable[trailState][j];
    796                            /* copy assigned-character code units and adjust fallback offsets */
    797                            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
    798                            case MBCS_STATE_VALID_16:
    799                                offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    800                                /* find the old offset according to the old state table */
    801                                oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
    802                                unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
    803                                if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
    804                                    toUFallbacks[fallback].offset=0x80000000|offset;
    805                                }
    806                                break;
    807                            case MBCS_STATE_VALID_16_PAIR:
    808                                offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    809                                /* find the old offset according to the old state table */
    810                                oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
    811                                (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
    812                                (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
    813                                break;
    814                            default:
    815                                break;
    816                            }
    817                        }
    818                    }
    819                }
    820            }
    821        }
    822    }
    823 
    824    /* remove temporary flags from fallback offsets that protected them from being modified twice */
    825    for(i=0; i<countToUFallbacks; ++i) {
    826        toUFallbacks[i].offset&=0x7fffffff;
    827    }
    828 
    829    /* free temporary memory */
    830    uprv_free(oldUnicodeCodeUnits);
    831    uprv_free(oldStateTable);
    832 }
    833 
    834 /*
    835 * recursive sub-function of compactToUnicodeHelper()
    836 * returns:
    837 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
    838 *    if all sequences from this state are unassigned, returns the
    839 * <0 there are assignments in unicodeCodeUnits[]
    840 * 0  no use of unicodeCodeUnits[]
    841 */
    842 static int32_t
    843 findUnassigned(UCMStates *states,
    844               uint16_t *unicodeCodeUnits,
    845               _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    846               int32_t state, int32_t offset, uint32_t b) {
    847    int32_t i, entry, savings, localSavings, belowSavings;
    848    UBool haveAssigned;
    849 
    850    localSavings=belowSavings=0;
    851    haveAssigned=false;
    852    for(i=0; i<256; ++i) {
    853        entry=states->stateTable[state][i];
    854        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
    855            savings=findUnassigned(states,
    856                        unicodeCodeUnits,
    857                        toUFallbacks, countToUFallbacks,
    858                        MBCS_ENTRY_TRANSITION_STATE(entry),
    859                        offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
    860                        (b << 8) | static_cast<uint32_t>(i));
    861            if(savings<0) {
    862                haveAssigned=true;
    863            } else if(savings>0) {
    864                printf("    all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
    865                    static_cast<unsigned long>((b << 8) | i), static_cast<long>(state), static_cast<long>(savings));
    866                belowSavings+=savings;
    867            }
    868        } else if(!haveAssigned) {
    869            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
    870            case MBCS_STATE_VALID_16:
    871                entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    872                if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
    873                    localSavings+=2;
    874                } else {
    875                    haveAssigned=true;
    876                }
    877                break;
    878            case MBCS_STATE_VALID_16_PAIR:
    879                entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
    880                if(unicodeCodeUnits[entry]==0xfffe) {
    881                    localSavings+=4;
    882                } else {
    883                    haveAssigned=true;
    884                }
    885                break;
    886            default:
    887                break;
    888            }
    889        }
    890    }
    891    if(haveAssigned) {
    892        return -1;
    893    } else {
    894        return localSavings+belowSavings;
    895    }
    896 }
    897 
    898 /* helper function for finding compaction opportunities */
    899 static void
    900 compactToUnicodeHelper(UCMStates *states,
    901                       uint16_t *unicodeCodeUnits,
    902                       _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
    903    int32_t state, savings;
    904 
    905    /* for each initial state */
    906    for(state=0; state<states->countStates; ++state) {
    907        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
    908            savings=findUnassigned(states,
    909                        unicodeCodeUnits,
    910                        toUFallbacks, countToUFallbacks,
    911                        state, 0, 0);
    912            if(savings>0) {
    913                printf("    all-unassigned sequences from initial state %ld use %ld bytes\n",
    914                    static_cast<long>(state), static_cast<long>(savings));
    915            }
    916        }
    917    }
    918 }
    919 
    920 U_CDECL_BEGIN
    921 static int32_t U_CALLCONV
    922 compareFallbacks(const void *context, const void *fb1, const void *fb2) {
    923    (void)context;
    924    return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
    925 }
    926 U_CDECL_END
    927 
    928 U_CAPI void U_EXPORT2
    929 ucm_optimizeStates(UCMStates *states,
    930                   uint16_t **pUnicodeCodeUnits,
    931                   _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
    932                   UBool verbose) {
    933    UErrorCode errorCode;
    934    int32_t state, cell, entry;
    935 
    936    /* test each state table entry */
    937    for(state=0; state<states->countStates; ++state) {
    938        for(cell=0; cell<256; ++cell) {
    939            entry=states->stateTable[state][cell];
    940            /*
    941             * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
    942             * and the code point is "unassigned" (0xfffe), then change it to
    943             * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
    944             */
    945            if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
    946                states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
    947            }
    948        }
    949    }
    950 
    951    /* try to compact the toUnicode tables */
    952    if(states->maxCharLength==2) {
    953        compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
    954    } else if(states->maxCharLength>2) {
    955        if(verbose) {
    956            compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
    957        }
    958    }
    959 
    960    /* sort toUFallbacks */
    961    /*
    962     * It should be safe to sort them before compactToUnicode2() is called,
    963     * because it should not change the relative order of the offset values
    964     * that it adjusts, but they need to be sorted at some point, and
    965     * it is safest here.
    966     */
    967    if(countToUFallbacks>0) {
    968        errorCode=U_ZERO_ERROR; /* nothing bad will happen... */
    969        uprv_sortArray(toUFallbacks, countToUFallbacks,
    970                       sizeof(_MBCSToUFallback),
    971                       compareFallbacks, nullptr, false, &errorCode);
    972    }
    973 }
    974 
    975 /* use a complete state table ----------------------------------------------- */
    976 
    977 U_CAPI int32_t U_EXPORT2
    978 ucm_countChars(UCMStates *states,
    979               const uint8_t *bytes, int32_t length) {
    980    uint32_t offset;
    981    int32_t i, entry, count;
    982    uint8_t state;
    983 
    984    offset=0;
    985    count=0;
    986    state=0;
    987 
    988    if(states->countStates==0) {
    989        fprintf(stderr, "ucm error: there is no state information!\n");
    990        return -1;
    991    }
    992 
    993    /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
    994    if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
    995        state=1;
    996    }
    997 
    998    /*
    999     * Walk down the state table like in conversion,
   1000     * much like getNextUChar().
   1001     * We assume that c<=0x10ffff.
   1002     */
   1003    for(i=0; i<length; ++i) {
   1004        entry=states->stateTable[state][bytes[i]];
   1005        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
   1006            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
   1007            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
   1008        } else {
   1009            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
   1010            case MBCS_STATE_ILLEGAL:
   1011                fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
   1012                return -1;
   1013            case MBCS_STATE_CHANGE_ONLY:
   1014                fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
   1015                return -1;
   1016            case MBCS_STATE_UNASSIGNED:
   1017            case MBCS_STATE_FALLBACK_DIRECT_16:
   1018            case MBCS_STATE_VALID_DIRECT_16:
   1019            case MBCS_STATE_FALLBACK_DIRECT_20:
   1020            case MBCS_STATE_VALID_DIRECT_20:
   1021            case MBCS_STATE_VALID_16:
   1022            case MBCS_STATE_VALID_16_PAIR:
   1023                /* count a complete character and prepare for a new one */
   1024                ++count;
   1025                state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
   1026                offset=0;
   1027                break;
   1028            default:
   1029                /* reserved, must never occur */
   1030                fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry);
   1031                return -1;
   1032            }
   1033        }
   1034    }
   1035 
   1036    if(offset!=0) {
   1037        fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state);
   1038        return -1;
   1039    }
   1040 
   1041    /*
   1042     * for SI/SO (like EBCDIC-stateful), multiple-character results
   1043     * must consist of only double-byte sequences
   1044     */
   1045    if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
   1046        fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count);
   1047        return -1;
   1048    }
   1049 
   1050    return count;
   1051 }
   1052 #endif