tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

genmbcs.cpp (59343B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2000-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  genmbcs.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2000jul06
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include <stdio.h>
     20 #include "unicode/utypes.h"
     21 #include "cstring.h"
     22 #include "cmemory.h"
     23 #include "unewdata.h"
     24 #include "ucnv_cnv.h"
     25 #include "ucnvmbcs.h"
     26 #include "ucm.h"
     27 #include "makeconv.h"
     28 #include "genmbcs.h"
     29 #include "toolutil.h"
     30 
     31 /*
     32 * TODO: Split this file into toUnicode, SBCSFromUnicode and MBCSFromUnicode files.
     33 * Reduce tests for maxCharLength.
     34 */
     35 
/*
 * Build-time state for one MBCS/SBCS conversion table.
 * Instances are created by MBCSOpen() and released by MBCSClose();
 * MBCSGetDummy() additionally hands out a shared, zeroed instance.
 */
struct MBCSData {
    /*
     * Callback table returned to callers by MBCSOpen().
     * Keep this as the first member: MBCSClose() and MBCSIsValid() cast the
     * NewConverter pointer they receive straight back to MBCSData.
     */
    NewConverter newConverter;

    /* The parsed .ucm file; aliased, not owned (see MBCSInit()). */
    UCMFile *ucm;

    /* toUnicode (state table in ucm->states) */
    /* Fallback list maintained by setFallback()/removeFallback(); kept contiguous. */
    _MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT];
    /* Number of entries currently used in toUFallbacks. */
    int32_t countToUFallbacks;
    /*
     * Heap array allocated by MBCSStartMappings() with
     * ucm->states.countToUCodeUnits entries, each preset to 0xfffe ("unassigned");
     * freed by MBCSDestruct().
     */
    uint16_t *unicodeCodeUnits;

    /* fromUnicode */
    uint16_t stage1[MBCS_STAGE_1_SIZE];
    uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */
    uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */
    /*
     * Stage 3 storage, allocated and zeroed by MBCSStartMappings() and
     * freed by MBCSDestruct(). MBCSSingleAddFromUnicode() accesses it as an
     * array of uint16_t results.
     */
    uint8_t *fromUBytes;
    /* First unused index in stage 2 and in stage 3, respectively. */
    uint32_t stage2Top, stage3Top;

    /* fromUTF8 */
    uint16_t stageUTF8[0x10000>>MBCS_UTF8_STAGE_SHIFT];  /* allow for utf8Max=0xffff */

    /*
     * Maximum UTF-8-friendly code point.
     * 0 if !utf8Friendly, otherwise 0x01ff..0xffff in steps of 0x100.
     * If utf8Friendly, utf8Max is normally either MBCS_UTF8_MAX or 0xffff.
     */
    uint16_t utf8Max;

    /* true if the fromUnicode trie is built in the UTF-8-friendly layout */
    UBool utf8Friendly;
    /* Set by MBCSGetDummy() when SMALL is in effect; not otherwise written in the code shown here. */
    UBool omitFromU;
};
     66 
/*
 * prototypes
 *
 * Forward declarations for functions defined further down in this file.
 * MBCSClose(), MBCSIsValid(), MBCSAddTable() and MBCSWrite() are the ones
 * that MBCSInit() installs as the NewConverter callbacks.
 * The UBool-returning functions report success with true.
 */
U_CDECL_BEGIN
static void
MBCSClose(NewConverter *cnvData);

static UBool
MBCSStartMappings(MBCSData *mbcsData);

static UBool
MBCSAddToUnicode(MBCSData *mbcsData,
                 const uint8_t *bytes, int32_t length,
                 UChar32 c,
                 int8_t flag);

static UBool
MBCSIsValid(NewConverter *cnvData,
            const uint8_t *bytes, int32_t length);

static UBool
MBCSSingleAddFromUnicode(MBCSData *mbcsData,
                         const uint8_t *bytes, int32_t length,
                         UChar32 c,
                         int8_t flag);

static UBool
MBCSAddFromUnicode(MBCSData *mbcsData,
                   const uint8_t *bytes, int32_t length,
                   UChar32 c,
                   int8_t flag);

static void
MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData);

static UBool
MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);

static uint32_t
MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
          UNewDataMemory *pData, int32_t tableType);
U_CDECL_END
    107 
    108 /* helper ------------------------------------------------------------------- */
    109 
    110 static inline char
    111 hexDigit(uint8_t digit) {
    112    return digit <= 9 ? static_cast<char>('0' + digit) : static_cast<char>('a' - 10 + digit);
    113 }
    114 
    115 static inline char *
    116 printBytes(char *buffer, size_t bufferLength, const uint8_t *bytes, int32_t length) {
    117    char *s=buffer;
    118    while(length>0 && (static_cast<size_t>(s-buffer) < bufferLength-3)) {
    119        *s++ = hexDigit(static_cast<uint8_t>(*bytes >> 4));
    120        *s++ = hexDigit(static_cast<uint8_t>(*bytes & 0xf));
    121        ++bytes;
    122        --length;
    123    }
    124 
    125    *s=0;
    126    return buffer;
    127 }
    128 
    129 /* implementation ----------------------------------------------------------- */
    130 
    131 static MBCSData gDummy;
    132 
    133 
    134 U_CFUNC const MBCSData *
    135 MBCSGetDummy() {
    136    uprv_memset(&gDummy, 0, sizeof(MBCSData));
    137 
    138    /*
    139     * Set "pessimistic" values which may sometimes move too many
    140     * mappings to the extension table (but never too few).
    141     * These values cause MBCSOkForBaseFromUnicode() to return false for the
    142     * largest set of mappings.
    143     * Assume maxCharLength>1.
    144     */
    145    gDummy.utf8Friendly=true;
    146    if(SMALL) {
    147        gDummy.utf8Max=0xffff;
    148        gDummy.omitFromU=true;
    149    } else {
    150        gDummy.utf8Max=MBCS_UTF8_MAX;
    151    }
    152    return &gDummy;
    153 }
    154 
    155 static void
    156 MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
    157    uprv_memset(mbcsData, 0, sizeof(MBCSData));
    158 
    159    mbcsData->ucm=ucm; /* aliased, not owned */
    160 
    161    mbcsData->newConverter.close=MBCSClose;
    162    mbcsData->newConverter.isValid=MBCSIsValid;
    163    mbcsData->newConverter.addTable=MBCSAddTable;
    164    mbcsData->newConverter.write=MBCSWrite;
    165 }
    166 
    167 U_CFUNC NewConverter *
    168 MBCSOpen(UCMFile *ucm) {
    169    MBCSData *mbcsData=(MBCSData *)uprv_malloc(sizeof(MBCSData));
    170    if(mbcsData==nullptr) {
    171        printf("out of memory\n");
    172        exit(U_MEMORY_ALLOCATION_ERROR);
    173    }
    174 
    175    MBCSInit(mbcsData, ucm);
    176    return &mbcsData->newConverter;
    177 }
    178 
    179 static void
    180 MBCSDestruct(MBCSData *mbcsData) {
    181    uprv_free(mbcsData->unicodeCodeUnits);
    182    uprv_free(mbcsData->fromUBytes);
    183 }
    184 
    185 U_CDECL_BEGIN
    186 static void
    187 MBCSClose(NewConverter *cnvData) {
    188    MBCSData *mbcsData=(MBCSData *)cnvData;
    189    if(mbcsData!=nullptr) {
    190        MBCSDestruct(mbcsData);
    191        uprv_free(mbcsData);
    192    }
    193 }
    194 U_CDECL_END
    195 
    196 static UBool
    197 MBCSStartMappings(MBCSData *mbcsData) {
    198    int32_t i, sum, maxCharLength,
    199            stage2NullLength, stage2AllocLength,
    200            stage3NullLength, stage3AllocLength;
    201 
    202    /* toUnicode */
    203 
    204    /* allocate the code unit array and prefill it with "unassigned" values */
    205    sum=mbcsData->ucm->states.countToUCodeUnits;
    206    if(VERBOSE) {
    207        printf("the total number of offsets is 0x%lx=%ld\n", static_cast<long>(sum), static_cast<long>(sum));
    208    }
    209 
    210    if(sum>0) {
    211        mbcsData->unicodeCodeUnits = static_cast<uint16_t*>(uprv_malloc(sum * sizeof(uint16_t)));
    212        if(mbcsData->unicodeCodeUnits==nullptr) {
    213            fprintf(stderr, "error: out of memory allocating %ld 16-bit code units\n",
    214                static_cast<long>(sum));
    215            return false;
    216        }
    217        for(i=0; i<sum; ++i) {
    218            mbcsData->unicodeCodeUnits[i]=0xfffe;
    219        }
    220    }
    221 
    222    /* fromUnicode */
    223    maxCharLength=mbcsData->ucm->states.maxCharLength;
    224 
    225    /* allocate the codepage mappings and preset the first 16 characters to 0 */
    226    if(maxCharLength==1) {
    227        /* allocate 64k 16-bit results for single-byte codepages */
    228        sum=0x20000;
    229    } else {
    230        /* allocate 1M * maxCharLength bytes for at most 1M mappings */
    231        sum=0x100000*maxCharLength;
    232    }
    233    mbcsData->fromUBytes = static_cast<uint8_t*>(uprv_malloc(sum));
    234    if(mbcsData->fromUBytes==nullptr) {
    235        fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", static_cast<long>(sum));
    236        return false;
    237    }
    238    uprv_memset(mbcsData->fromUBytes, 0, sum);
    239 
    240    /*
    241     * UTF-8-friendly fromUnicode tries: allocate multiple blocks at a time.
    242     * See ucnvmbcs.h for details.
    243     *
    244     * There is code, for example in ucnv_MBCSGetUnicodeSetForUnicode(), which
    245     * assumes that the initial stage 2/3 blocks are the all-unassigned ones.
    246     * Therefore, we refine the data structure while maintaining this placement
    247     * even though it would be convenient to allocate the ASCII block at the
    248     * beginning of stage 3, for example.
    249     *
    250     * UTF-8-friendly fromUnicode tries work from sorted tables and are built
    251     * pre-compacted, overlapping adjacent stage 2/3 blocks.
    252     * This is necessary because the block allocation and compaction changes
    253     * at SBCS_UTF8_MAX or MBCS_UTF8_MAX, and for MBCS tables the additional
    254     * stage table uses direct indexes into stage 3, without a multiplier and
    255     * thus with a smaller reach.
    256     *
    257     * Non-UTF-8-friendly fromUnicode tries work from unsorted tables
    258     * (because implicit precision is used), and are compacted
    259     * in post-processing.
    260     *
    261     * Preallocation for UTF-8-friendly fromUnicode tries:
    262     *
    263     * Stage 3:
    264     * 64-entry all-unassigned first block followed by ASCII (128 entries).
    265     *
    266     * Stage 2:
    267     * 64-entry all-unassigned first block followed by preallocated
    268     * 64-block for ASCII.
    269     */
    270 
    271    /* Preallocate ASCII as a linear 128-entry stage 3 block. */
    272    stage2NullLength=MBCS_STAGE_2_BLOCK_SIZE;
    273    stage2AllocLength=MBCS_STAGE_2_BLOCK_SIZE;
    274 
    275    stage3NullLength=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
    276    stage3AllocLength=128; /* ASCII U+0000..U+007f */
    277 
    278    /* Initialize stage 1 for the preallocated blocks. */
    279    sum=stage2NullLength;
    280    for(i=0; i<(stage2AllocLength>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT); ++i) {
    281        mbcsData->stage1[i]=sum;
    282        sum+=MBCS_STAGE_2_BLOCK_SIZE;
    283    }
    284    mbcsData->stage2Top=stage2NullLength+stage2AllocLength; /* ==sum */
    285 
    286    /*
    287     * Stage 2 indexes count 16-blocks in stage 3 as follows:
    288     * SBCS: directly, indexes increment by 16
    289     * MBCS: indexes need to be multiplied by 16*maxCharLength, indexes increment by 1
    290     * MBCS UTF-8: directly, indexes increment by 16
    291     */
    292    if(maxCharLength==1) {
    293        sum=stage3NullLength;
    294        for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
    295            mbcsData->stage2Single[mbcsData->stage1[0]+i]=sum;
    296            sum+=MBCS_STAGE_3_BLOCK_SIZE;
    297        }
    298    } else {
    299        sum=stage3NullLength/MBCS_STAGE_3_GRANULARITY;
    300        for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) {
    301            mbcsData->stage2[mbcsData->stage1[0]+i]=sum;
    302            sum+=MBCS_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_GRANULARITY;
    303        }
    304    }
    305 
    306    sum=stage3NullLength;
    307    for(i=0; i<(stage3AllocLength/MBCS_UTF8_STAGE_3_BLOCK_SIZE); ++i) {
    308        mbcsData->stageUTF8[i]=sum;
    309        sum+=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
    310    }
    311 
    312    /*
    313     * Allocate a 64-entry all-unassigned first stage 3 block,
    314     * for UTF-8-friendly lookup with a trail byte,
    315     * plus 128 entries for ASCII.
    316     */
    317    mbcsData->stage3Top=(stage3NullLength+stage3AllocLength)*maxCharLength; /* ==sum*maxCharLength */
    318 
    319    return true;
    320 }
    321 
    322 /* return true for success */
    323 static UBool
    324 setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) {
    325    int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
    326    if(i>=0) {
    327        /* if there is already a fallback for this offset, then overwrite it */
    328        mbcsData->toUFallbacks[i].codePoint=c;
    329        return true;
    330    } else {
    331        /* if there is no fallback for this offset, then add one */
    332        i=mbcsData->countToUFallbacks;
    333        if(i>=MBCS_MAX_FALLBACK_COUNT) {
    334            fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", static_cast<int>(c));
    335            return false;
    336        } else {
    337            mbcsData->toUFallbacks[i].offset=offset;
    338            mbcsData->toUFallbacks[i].codePoint=c;
    339            mbcsData->countToUFallbacks=i+1;
    340            return true;
    341        }
    342    }
    343 }
    344 
    345 /* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */
    346 static int32_t
    347 removeFallback(MBCSData *mbcsData, uint32_t offset) {
    348    int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
    349    if(i>=0) {
    350        _MBCSToUFallback *toUFallbacks;
    351        int32_t limit, old;
    352 
    353        toUFallbacks=mbcsData->toUFallbacks;
    354        limit=mbcsData->countToUFallbacks;
    355        old = static_cast<int32_t>(toUFallbacks[i].codePoint);
    356 
    357        /* copy the last fallback entry here to keep the list contiguous */
    358        toUFallbacks[i].offset=toUFallbacks[limit-1].offset;
    359        toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint;
    360        mbcsData->countToUFallbacks=limit-1;
    361        return old;
    362    } else {
    363        return -1;
    364    }
    365 }
    366 
/*
 * Add one toUnicode mapping bytes -> c to the state table / code unit array.
 *
 * The byte sequence is run through ucm->states.stateTable like in conversion;
 * the final entry's action decides where the code point is stored
 * (directly in the state table entry, or in unicodeCodeUnits at the
 * accumulated offset).
 *
 * flag is the precision of the mapping; as used by this function:
 *  0        this is a precise mapping
 *  >0       this is a fallback mapping (stored so that it does not replace
 *           a precise mapping in the 16-bit and 16-bit-pair cases);
 *           the value 3 additionally selects the "fallback direct" action
 *           codes for direct entries
 *  <0 (-1)  the precision of this mapping is not specified;
 *           duplicates are then only reported when VERBOSE instead of failing
 *
 * Returns true on success and false after printing an error to stderr.
 */
static UBool
MBCSAddToUnicode(MBCSData *mbcsData,
                 const uint8_t *bytes, int32_t length,
                 UChar32 c,
                 int8_t flag) {
    /* scratch space for printBytes(); longer byte sequences are printed truncated */
    char buffer[10];
    uint32_t offset=0;
    int32_t i=0, entry, old;
    uint8_t state=0;

    if(mbcsData->ucm->states.countStates==0) {
        fprintf(stderr, "error: there is no state information!\n");
        return false;
    }

    /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
    if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) {
        state=1;
    }

    /*
     * Walk down the state table like in conversion,
     * much like getNextUChar().
     * We assume that c<=0x10ffff.
     * The loop has no exit condition of its own: every path through a final
     * entry returns, and a transition entry on the last byte is an error.
     */
    for(i=0;;) {
        entry=mbcsData->ucm->states.stateTable[state][bytes[i++]];
        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
            /* i now counts the bytes consumed so far */
            if(i==length) {
                fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n",
                    static_cast<short>(state), printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(c));
                return false;
            }
            state = static_cast<uint8_t>(MBCS_ENTRY_TRANSITION_STATE(entry));
            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
        } else {
            /* final entry: it must coincide with the end of the byte sequence */
            if(i<length) {
                fprintf(stderr, "error: byte sequence too long by %d bytes, final state %u: 0x%s (U+%x)\n",
                    static_cast<int>(length - i), state, printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(c));
                return false;
            }
            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
            case MBCS_STATE_ILLEGAL:
                fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n",
                    static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length));
                return false;
            case MBCS_STATE_CHANGE_ONLY:
                fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n",
                    static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length));
                return false;
            case MBCS_STATE_UNASSIGNED:
                fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n",
                    static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length));
                return false;
            case MBCS_STATE_FALLBACK_DIRECT_16:
            case MBCS_STATE_VALID_DIRECT_16:
            case MBCS_STATE_FALLBACK_DIRECT_20:
            case MBCS_STATE_VALID_DIRECT_20:
                /* direct actions: the code point lives in the state table entry itself */
                if(MBCS_ENTRY_SET_STATE(entry, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
                    /* the "direct" action's value is not "valid-direct-16-unassigned" any more */
                    /* recover the previously stored code point for the message */
                    if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_DIRECT_16 || MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_FALLBACK_DIRECT_16) {
                        old=MBCS_ENTRY_FINAL_VALUE(entry);
                    } else {
                        old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
                    }
                    if(flag>=0) {
                        fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
                            static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(old));
                        return false;
                    } else if(VERBOSE) {
                        fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
                            static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(old));
                    }
                    /*
                     * Continue after the above warning
                     * if the precision of the mapping is unspecified.
                     */
                }
                /*
                 * reassign the correct action code:
                 * +2 for a flag==3 mapping, +1 for a supplementary code point
                 * (this relies on the numeric order of the four direct action
                 * codes - presumably valid-16, valid-20, fallback-16,
                 * fallback-20; confirm in ucnvmbcs.h)
                 */
                entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0)));

                /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */
                if(c<=0xffff) {
                    entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c);
                } else {
                    entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000);
                }
                /* i was already incremented past the last byte, hence bytes[i-1] */
                mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry;
                break;
            case MBCS_STATE_VALID_16:
                /* bits 26..16 are not used, 0 */
                /* bits 15..7 contain the final offset delta to one 16-bit code unit */
                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
                /*
                 * check that this byte sequence is still unassigned;
                 * note that an existing fallback for this offset is removed
                 * as a side effect of the check
                 */
                if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) {
                    if(flag>=0) {
                        fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
                            static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(old));
                        return false;
                    } else if(VERBOSE) {
                        fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
                            static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(old));
                    }
                }
                if(c>=0x10000) {
                    fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n",
                        static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length));
                    return false;
                }
                if(flag>0) {
                    /* assign only if there is no precise mapping */
                    if(mbcsData->unicodeCodeUnits[offset]==0xfffe) {
                        /* fallbacks go into the separate toUFallbacks list */
                        return setFallback(mbcsData, offset, c);
                    }
                } else {
                    mbcsData->unicodeCodeUnits[offset] = static_cast<uint16_t>(c);
                }
                break;
            case MBCS_STATE_VALID_16_PAIR:
                /* bits 26..16 are not used, 0 */
                /* bits 15..7 contain the final offset delta to two 16-bit code units */
                offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
                /* check that this byte sequence is still unassigned */
                old=mbcsData->unicodeCodeUnits[offset];
                if(old<0xfffe) {
                    /* decode the existing one- or two-unit value for the message */
                    int32_t real;
                    if(old<0xd800) {
                        real=old;
                    } else if(old<=0xdfff) {
                        real=0x10000+((old&0x3ff)<<10)+((mbcsData->unicodeCodeUnits[offset+1])&0x3ff);
                    } else /* old<=0xe001 */ {
                        real=mbcsData->unicodeCodeUnits[offset+1];
                    }
                    if(flag>=0) {
                        fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
                            static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(real));
                        return false;
                    } else if(VERBOSE) {
                        fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n",
                            static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(real));
                    }
                }
                if(flag>0) {
                    /* assign only if there is no precise mapping */
                    if(old<=0xdbff || old==0xe000) {
                        /* do nothing */
                    } else if(c<=0xffff) {
                        /* set a BMP fallback code point as a pair with 0xe001 */
                        mbcsData->unicodeCodeUnits[offset++]=0xe001;
                        mbcsData->unicodeCodeUnits[offset] = static_cast<uint16_t>(c);
                    } else {
                        /* set a fallback surrogate pair with two second surrogates */
                        mbcsData->unicodeCodeUnits[offset++] = static_cast<uint16_t>(0xdbc0 + (c >> 10));
                        mbcsData->unicodeCodeUnits[offset] = static_cast<uint16_t>(0xdc00 + (c & 0x3ff));
                    }
                } else {
                    if(c<0xd800) {
                        /* set a BMP code point */
                        mbcsData->unicodeCodeUnits[offset] = static_cast<uint16_t>(c);
                    } else if(c<=0xffff) {
                        /* set a BMP code point above 0xd800 as a pair with 0xe000 */
                        mbcsData->unicodeCodeUnits[offset++]=0xe000;
                        mbcsData->unicodeCodeUnits[offset] = static_cast<uint16_t>(c);
                    } else {
                        /* set a surrogate pair */
                        mbcsData->unicodeCodeUnits[offset++] = static_cast<uint16_t>(0xd7c0 + (c >> 10));
                        mbcsData->unicodeCodeUnits[offset] = static_cast<uint16_t>(0xdc00 + (c & 0x3ff));
                    }
                }
                break;
            default:
                /* reserved, must never occur */
                fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n",
                    static_cast<int>(entry), printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(c));
                return false;
            }

            return true;
        }
    }
}
    554 
    555 U_CDECL_BEGIN
    556 /* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */
    557 static UBool
    558 MBCSIsValid(NewConverter *cnvData,
    559            const uint8_t *bytes, int32_t length) {
    560    MBCSData *mbcsData=(MBCSData *)cnvData;
    561 
    562    return 1==ucm_countChars(&mbcsData->ucm->states, bytes, length);
    563 }
    564 U_CDECL_END
/*
 * Add one fromUnicode mapping c -> *bytes for a single-byte codepage.
 *
 * Walks stage1 -> stage2Single -> stage 3 (fromUBytes viewed as uint16_t),
 * allocating stage 2 and stage 3 blocks on demand, and stores a 16-bit result
 * that combines the byte with flag bits:
 *   0xf00|b  for flag<=0 (precise or unspecified precision)
 *   0xc00|b  for a fallback from a private use code point
 *   0x800|b  for any other fallback
 *
 * Mappings with flag==2 (|2 SUB mappings) are ignored.
 * Only the first byte is used; the length parameter is unused.
 *
 * Returns true on success and false after printing an error to stderr.
 * Note that on a duplicate-code-point error the stage 3 entry has already
 * been overwritten with the new value.
 */
static UBool
MBCSSingleAddFromUnicode(MBCSData *mbcsData,
                         const uint8_t *bytes, int32_t /*length*/,
                         UChar32 c,
                         int8_t flag) {
    uint16_t *stage3, *p;
    uint32_t idx;
    uint16_t old;
    uint8_t b;

    uint32_t blockSize, newTop, i, nextOffset, newBlock, min;

    /* ignore |2 SUB mappings */
    if(flag==2) {
        return true;
    }

    /*
     * Walk down the triple-stage compact array ("trie") and
     * allocate parts as necessary.
     * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
     * We assume that length<=maxCharLength and that c<=0x10ffff.
     */
    stage3 = reinterpret_cast<uint16_t*>(mbcsData->fromUBytes);
    b=*bytes;

    /* inspect stage 1 */
    idx=c>>MBCS_STAGE_1_SHIFT;
    if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
        /* round the stage 2 offset down to the start of a group of MBCS_UTF8_STAGE_3_BLOCKS entries */
        nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
    } else {
        nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
    }
    if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
        /* allocate another block in stage 2 */
        newBlock=mbcsData->stage2Top;
        if(mbcsData->utf8Friendly) {
            /*
             * Let the new block overlap trailing zero entries of the previous
             * one, but by no more than nextOffset entries so that the entry
             * for c itself is at or above the old top.
             */
            min=newBlock-nextOffset; /* minimum block start with overlap */
            while(min<newBlock && mbcsData->stage2Single[newBlock-1]==0) {
                --newBlock;
            }
        }
        newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;

        if(newTop>MBCS_MAX_STAGE_2_TOP) {
            fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", static_cast<int>(c), b);
            return false;
        }

        /*
         * each stage 2 block contains 64 16-bit words:
         * 6 code point bits 9..4 with 1 stage 3 index
         */
        mbcsData->stage1[idx] = static_cast<uint16_t>(newBlock);
        mbcsData->stage2Top=newTop;
    }

    /* inspect stage 2 */
    idx=mbcsData->stage1[idx]+nextOffset;
    if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) {
        /* allocate 64-entry blocks for UTF-8-friendly lookup */
        blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE;
        nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
    } else {
        blockSize=MBCS_STAGE_3_BLOCK_SIZE;
        nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
    }
    if(mbcsData->stage2Single[idx]==0) {
        /* allocate another block in stage 3 */
        newBlock=mbcsData->stage3Top;
        if(mbcsData->utf8Friendly) {
            /* same overlap technique as for stage 2 above, applied to zero stage 3 results */
            min=newBlock-nextOffset; /* minimum block start with overlap */
            while(min<newBlock && stage3[newBlock-1]==0) {
                --newBlock;
            }
        }
        newTop=newBlock+blockSize;

        if(newTop>MBCS_STAGE_3_SBCS_SIZE) {
            fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", static_cast<int>(c), b);
            return false;
        }
        /* each block has 16 uint16_t entries */
        /*
         * Fill one stage 2 entry per MBCS_STAGE_3_BLOCK_SIZE results;
         * a larger UTF-8-friendly block thus sets several consecutive entries.
         */
        i=idx;
        while(newBlock<newTop) {
            mbcsData->stage2Single[i++] = static_cast<uint16_t>(newBlock);
            newBlock+=MBCS_STAGE_3_BLOCK_SIZE;
        }
        mbcsData->stage3Top=newTop; /* ==newBlock */
    }

    /* write the codepage entry into stage 3 and get the previous entry */
    p=stage3+mbcsData->stage2Single[idx]+nextOffset;
    old=*p;
    if(flag<=0) {
        *p = static_cast<uint16_t>(0xf00 | b);
    } else if(IS_PRIVATE_USE(c)) {
        *p = static_cast<uint16_t>(0xc00 | b);
    } else {
        *p = static_cast<uint16_t>(0x800 | b);
    }

    /* check that this Unicode code point was still unassigned */
    /* (every stored result has flag bits at or above 0x800, so >=0x100 means "was assigned") */
    if(old>=0x100) {
        if(flag>=0) {
            fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
                static_cast<int>(c), b, old & 0xff);
            return false;
        } else if(VERBOSE) {
            fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
                static_cast<int>(c), b, old & 0xff);
        }
        /* continue after the above warning if the precision of the mapping is unspecified */
    }

    return true;
}
    682 
    683 static UBool
    684 MBCSAddFromUnicode(MBCSData *mbcsData,
    685                   const uint8_t *bytes, int32_t length,
    686                   UChar32 c,
    687                   int8_t flag) {
    688    char buffer[10];
    689    const uint8_t *pb;
    690    uint8_t *stage3, *p;
    691    uint32_t idx, b, old, stage3Index;
    692    int32_t maxCharLength;
    693 
    694    uint32_t blockSize, newTop, i, nextOffset, newBlock, min, overlap, maxOverlap;
    695 
    696    maxCharLength=mbcsData->ucm->states.maxCharLength;
    697 
    698    if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO &&
    699        (!IGNORE_SISO_CHECK && (*bytes==0xe || *bytes==0xf))
    700    ) {
    701        fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n",
    702            static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length));
    703        return false;
    704    }
    705 
    706    if(flag==1 && length==1 && *bytes==0) {
    707        fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n",
    708            static_cast<int>(c), *bytes);
    709        return false;
    710    }
    711 
    712    /*
    713     * Walk down the triple-stage compact array ("trie") and
    714     * allocate parts as necessary.
    715     * Note that the first stage 2 and 3 blocks are reserved for
    716     * all-unassigned mappings.
    717     * We assume that length<=maxCharLength and that c<=0x10ffff.
    718     */
    719    stage3=mbcsData->fromUBytes;
    720 
    721    /* inspect stage 1 */
    722    idx=c>>MBCS_STAGE_1_SHIFT;
    723    if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
    724        nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
    725    } else {
    726        nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
    727    }
    728    if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
    729        /* allocate another block in stage 2 */
    730        newBlock=mbcsData->stage2Top;
    731        if(mbcsData->utf8Friendly) {
    732            min=newBlock-nextOffset; /* minimum block start with overlap */
    733            while(min<newBlock && mbcsData->stage2[newBlock-1]==0) {
    734                --newBlock;
    735            }
    736        }
    737        newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
    738 
    739        if(newTop>MBCS_MAX_STAGE_2_TOP) {
    740            fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n",
    741                static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length));
    742            return false;
    743        }
    744 
    745        /*
    746         * each stage 2 block contains 64 32-bit words:
    747         * 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index
    748         */
    749        i=idx;
    750        while(newBlock<newTop) {
    751            mbcsData->stage1[i++] = static_cast<uint16_t>(newBlock);
    752            newBlock+=MBCS_STAGE_2_BLOCK_SIZE;
    753        }
    754        mbcsData->stage2Top=newTop; /* ==newBlock */
    755    }
    756 
    757    /* inspect stage 2 */
    758    idx=mbcsData->stage1[idx]+nextOffset;
    759    if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
    760        /* allocate 64-entry blocks for UTF-8-friendly lookup */
    761        blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength;
    762        nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
    763    } else {
    764        blockSize=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
    765        nextOffset=c&MBCS_STAGE_3_BLOCK_MASK;
    766    }
    767    if(mbcsData->stage2[idx]==0) {
    768        /* allocate another block in stage 3 */
    769        newBlock=mbcsData->stage3Top;
    770        if(mbcsData->utf8Friendly && nextOffset>=MBCS_STAGE_3_GRANULARITY) {
    771            /*
    772             * Overlap stage 3 blocks only in multiples of 16-entry blocks
    773             * because of the indexing granularity in stage 2.
    774             */
    775            maxOverlap=(nextOffset&~(MBCS_STAGE_3_GRANULARITY-1))*maxCharLength;
    776            for(overlap=0;
    777                overlap<maxOverlap && stage3[newBlock-overlap-1]==0;
    778                ++overlap) {}
    779 
    780            overlap=(overlap/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
    781            overlap=(overlap*MBCS_STAGE_3_GRANULARITY)*maxCharLength;
    782 
    783            newBlock-=overlap;
    784        }
    785        newTop=newBlock+blockSize;
    786 
    787        if (newTop > MBCS_STAGE_3_MBCS_SIZE * static_cast<uint32_t>(maxCharLength)) {
    788            fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n",
    789                static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length));
    790            return false;
    791        }
    792        /* each block has 16*maxCharLength bytes */
    793        i=idx;
    794        while(newBlock<newTop) {
    795            mbcsData->stage2[i++]=(newBlock/MBCS_STAGE_3_GRANULARITY)/maxCharLength;
    796            newBlock+=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength;
    797        }
    798        mbcsData->stage3Top=newTop; /* ==newBlock */
    799    }
    800 
    801    stage3Index = MBCS_STAGE_3_GRANULARITY * static_cast<uint32_t>(static_cast<uint16_t>(mbcsData->stage2[idx]));
    802 
    803    /* Build an alternate, UTF-8-friendly stage table as well. */
    804    if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
    805        /* Overflow for uint16_t entries in stageUTF8? */
    806        if(stage3Index>0xffff) {
    807            /*
    808             * This can occur only if the mapping table is nearly perfectly filled and if
    809             * utf8Max==0xffff.
    810             * (There is no known charset like this. GB 18030 does not map
    811             * surrogate code points and LMBCS does not map 256 PUA code points.)
    812             *
    813             * Otherwise, stage3Index<=MBCS_UTF8_LIMIT<0xffff
    814             * (stage3Index can at most reach exactly MBCS_UTF8_LIMIT)
    815             * because we have a sorted table and there are at most MBCS_UTF8_LIMIT
    816             * mappings with 0<=c<MBCS_UTF8_LIMIT, and there is only also
    817             * the initial all-unassigned block in stage3.
    818             *
    819             * Solution for the overflow: Reduce utf8Max to the next lower value, 0xfeff.
    820             *
    821             * (See svn revision 20866 of the markus/ucnvutf8 feature branch for
    822             * code that causes MBCSAddTable() to rebuild the table not utf8Friendly
    823             * in case of overflow. That code was not tested.)
    824             */
    825            mbcsData->utf8Max=0xfeff;
    826        } else {
    827            /*
    828             * The stage 3 block has been assigned for the regular trie.
    829             * Just copy its index into stageUTF8[], without the granularity.
    830             */
    831            mbcsData->stageUTF8[c >> MBCS_UTF8_STAGE_SHIFT] = static_cast<uint16_t>(stage3Index);
    832        }
    833    }
    834 
    835    /* write the codepage bytes into stage 3 and get the previous bytes */
    836 
    837    /* assemble the bytes into a single integer */
    838    pb=bytes;
    839    b=0;
    840    switch(length) {
    841    case 4:
    842        b=*pb++;
    843        U_FALLTHROUGH;
    844    case 3:
    845        b=(b<<8)|*pb++;
    846        U_FALLTHROUGH;
    847    case 2:
    848        b=(b<<8)|*pb++;
    849        U_FALLTHROUGH;
    850    case 1:
    851    default:
    852        b=(b<<8)|*pb++;
    853        break;
    854    }
    855 
    856    old=0;
    857    p=stage3+(stage3Index+nextOffset)*maxCharLength;
    858    switch(maxCharLength) {
    859    case 2:
    860        old = *reinterpret_cast<uint16_t*>(p);
    861        *reinterpret_cast<uint16_t*>(p) = static_cast<uint16_t>(b);
    862        break;
    863    case 3:
    864        old = static_cast<uint32_t>(*p) << 16;
    865        *p++ = static_cast<uint8_t>(b >> 16);
    866        old |= static_cast<uint32_t>(*p) << 8;
    867        *p++ = static_cast<uint8_t>(b >> 8);
    868        old|=*p;
    869        *p = static_cast<uint8_t>(b);
    870        break;
    871    case 4:
    872        old = *reinterpret_cast<uint32_t*>(p);
    873        *reinterpret_cast<uint32_t*>(p) = b;
    874        break;
    875    default:
    876        /* will never occur */
    877        break;
    878    }
    879 
    880    /* check that this Unicode code point was still unassigned */
    881    if((mbcsData->stage2[idx+(nextOffset>>MBCS_STAGE_2_SHIFT)]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
    882        if(flag>=0) {
    883            fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
    884                static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(old));
    885            return false;
    886        } else if(VERBOSE) {
    887            fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n",
    888                static_cast<int>(c), printBytes(buffer, sizeof(buffer), bytes, length), static_cast<int>(old));
    889        }
    890        /* continue after the above warning if the precision of the mapping is
    891           unspecified */
    892    }
    893    if(flag<=0) {
    894        /* set the roundtrip flag */
    895        mbcsData->stage2[idx+(nextOffset>>4)]|=(1UL<<(16+(c&0xf)));
    896    }
    897 
    898    return true;
    899 }
    900 
    901 U_CFUNC UBool
    902 MBCSOkForBaseFromUnicode(const MBCSData *mbcsData,
    903                         const uint8_t *bytes, int32_t length,
    904                         UChar32 c, int8_t flag) {
    905    /*
    906     * A 1:1 mapping does not fit into the MBCS base table's fromUnicode table under
    907     * the following conditions:
    908     *
    909     * - a |2 SUB mapping for <subchar1> (no base table data structure for them)
    910     * - a |1 fallback to 0x00 (result value 0, indistinguishable from unmappable entry)
    911     * - a multi-byte mapping with leading 0x00 bytes (no explicit length field)
    912     *
    913     * Some of these tests are redundant with ucm_mappingType().
    914     */
    915    if( (flag==2 && length==1) ||
    916        (flag==1 && bytes[0]==0) || /* testing length==1 would be redundant with the next test */
    917        (flag<=1 && length>1 && bytes[0]==0)
    918    ) {
    919        return false;
    920    }
    921 
    922    /*
    923     * Additional restrictions for UTF-8-friendly fromUnicode tables,
    924     * for code points up to the maximum optimized one:
    925     *
    926     * - any mapping to 0x00 (result value 0, indistinguishable from unmappable entry)
    927     * - any |1 fallback (no roundtrip flags in the optimized table)
    928     */
    929    if(mbcsData->utf8Friendly && flag<=1 && c<=mbcsData->utf8Max && (bytes[0]==0 || flag==1)) {
    930        return false;
    931    }
    932 
    933    /*
    934     * If we omit the fromUnicode data, we can only store roundtrips there
    935     * because only they are recoverable from the toUnicode data.
    936     * Fallbacks must go into the extension table.
    937     */
    938    if(mbcsData->omitFromU && flag!=0) {
    939        return false;
    940    }
    941 
    942    /* All other mappings do fit into the base table. */
    943    return true;
    944 }
    945 
    946 U_CDECL_BEGIN
    947 /* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
    948 static UBool
    949 MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
    950    MBCSData *mbcsData;
    951    UCMapping *m;
    952    UChar32 c;
    953    int32_t i, maxCharLength;
    954    int8_t f;
    955    UBool isOK, utf8Friendly;
    956 
    957    staticData->unicodeMask=table->unicodeMask;
    958    if(staticData->unicodeMask==3) {
    959        fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n");
    960        return false;
    961    }
    962 
    963    staticData->conversionType=UCNV_MBCS;
    964 
    965    mbcsData=(MBCSData *)cnvData;
    966    maxCharLength=mbcsData->ucm->states.maxCharLength;
    967 
    968    /*
    969     * Generation of UTF-8-friendly data requires
    970     * a sorted table, which makeconv generates when explicit precision
    971     * indicators are used.
    972     */
    973    mbcsData->utf8Friendly = utf8Friendly = (table->flagsType & UCM_FLAGS_EXPLICIT) != 0;
    974    if(utf8Friendly) {
    975        mbcsData->utf8Max=MBCS_UTF8_MAX;
    976        if(SMALL && maxCharLength>1) {
    977            mbcsData->omitFromU=true;
    978        }
    979    } else {
    980        mbcsData->utf8Max=0;
    981        if(SMALL && maxCharLength>1) {
    982            fprintf(stderr,
    983                "makeconv warning: --small not available for .ucm files without |0 etc.\n");
    984        }
    985    }
    986 
    987    if(!MBCSStartMappings(mbcsData)) {
    988        return false;
    989    }
    990 
    991    staticData->hasFromUnicodeFallback=false;
    992    staticData->hasToUnicodeFallback=false;
    993 
    994    isOK=true;
    995 
    996    m=table->mappings;
    997    for(i=0; i<table->mappingsLength; ++m, ++i) {
    998        c=m->u;
    999        f=m->f;
   1000 
   1001        /*
   1002         * Small optimization for --small .cnv files:
   1003         *
   1004         * If there are fromUnicode mappings above MBCS_UTF8_MAX,
   1005         * then the file size will be smaller if we make utf8Max larger
   1006         * because the size increase in stageUTF8 will be more than balanced by
   1007         * how much less of stage2 needs to be stored.
   1008         *
   1009         * There is no point in doing this incrementally because stageUTF8
   1010         * uses so much less space per block than stage2,
   1011         * so we immediately increase utf8Max to 0xffff.
   1012         *
   1013         * Do not increase utf8Max if it is already at 0xfeff because MBCSAddFromUnicode()
   1014         * sets it to that value when stageUTF8 overflows.
   1015         */
   1016        if( mbcsData->omitFromU && f<=1 &&
   1017            mbcsData->utf8Max<c && c<=0xffff &&
   1018            mbcsData->utf8Max<0xfeff
   1019        ) {
   1020            mbcsData->utf8Max=0xffff;
   1021        }
   1022 
   1023        switch(f) {
   1024        case -1:
   1025            /* there was no precision/fallback indicator */
   1026            /* fall through to set the mappings */
   1027            U_FALLTHROUGH;
   1028        case 0:
   1029            /* set roundtrip mappings */
   1030            isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1031 
   1032            if(maxCharLength==1) {
   1033                isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1034            } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
   1035                isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1036            } else {
   1037                m->f|=MBCS_FROM_U_EXT_FLAG;
   1038                m->moveFlag=UCM_MOVE_TO_EXT;
   1039            }
   1040            break;
   1041        case 1:
   1042            /* set only a fallback mapping from Unicode to codepage */
   1043            if(maxCharLength==1) {
   1044                staticData->hasFromUnicodeFallback=true;
   1045                isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1046            } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
   1047                staticData->hasFromUnicodeFallback=true;
   1048                isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1049            } else {
   1050                m->f|=MBCS_FROM_U_EXT_FLAG;
   1051                m->moveFlag=UCM_MOVE_TO_EXT;
   1052            }
   1053            break;
   1054        case 2:
   1055            /* ignore |2 SUB mappings, except to move <subchar1> mappings to the extension table */
   1056            if(maxCharLength>1 && m->bLen==1) {
   1057                m->f|=MBCS_FROM_U_EXT_FLAG;
   1058                m->moveFlag=UCM_MOVE_TO_EXT;
   1059            }
   1060            break;
   1061        case 3:
   1062            /* set only a fallback mapping from codepage to Unicode */
   1063            staticData->hasToUnicodeFallback=true;
   1064            isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
   1065            break;
   1066        case 4:
   1067            /* move "good one-way" mappings to the extension table */
   1068            m->f|=MBCS_FROM_U_EXT_FLAG;
   1069            m->moveFlag=UCM_MOVE_TO_EXT;
   1070            break;
   1071        default:
   1072            /* will not occur because the parser checked it already */
   1073            fprintf(stderr, "error: illegal fallback indicator %d\n", f);
   1074            return false;
   1075        }
   1076    }
   1077 
   1078    MBCSPostprocess(mbcsData, staticData);
   1079 
   1080    return isOK;
   1081 }
   1082 U_CDECL_END
   1083 static UBool
   1084 transformEUC(MBCSData *mbcsData) {
   1085    uint8_t *p8;
   1086    uint32_t i, value, oldLength, old3Top;
   1087    uint8_t b;
   1088 
   1089    oldLength=mbcsData->ucm->states.maxCharLength;
   1090    if(oldLength<3) {
   1091        return false;
   1092    }
   1093 
   1094    old3Top=mbcsData->stage3Top;
   1095 
   1096    /* careful: 2-byte and 4-byte codes are stored in platform endianness! */
   1097 
   1098    /* test if all first bytes are in {0, 0x8e, 0x8f} */
   1099    p8=mbcsData->fromUBytes;
   1100 
   1101 #if !U_IS_BIG_ENDIAN
   1102    if(oldLength==4) {
   1103        p8+=3;
   1104    }
   1105 #endif
   1106 
   1107    for(i=0; i<old3Top; i+=oldLength) {
   1108        b=p8[i];
   1109        if(b!=0 && b!=0x8e && b!=0x8f) {
   1110            /* some first byte does not fit the EUC pattern, nothing to be done */
   1111            return false;
   1112        }
   1113    }
   1114    /* restore p if it was modified above */
   1115    p8=mbcsData->fromUBytes;
   1116 
   1117    /* modify outputType and adjust stage3Top */
   1118    mbcsData->ucm->states.outputType = static_cast<int8_t>(MBCS_OUTPUT_3_EUC + oldLength - 3);
   1119    mbcsData->stage3Top=(old3Top*(oldLength-1))/oldLength;
   1120 
   1121    /*
   1122     * EUC-encode all byte sequences;
   1123     * see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly,
   1124     * p. 161 in chapter 4 "Encoding Methods"
   1125     *
   1126     * This also must reverse the byte order if the platform is little-endian!
   1127     */
   1128    if(oldLength==3) {
   1129        uint16_t* q = reinterpret_cast<uint16_t*>(p8);
   1130        for(i=0; i<old3Top; i+=oldLength) {
   1131            b=*p8;
   1132            if(b==0) {
   1133                /* short sequences are stored directly */
   1134                /* code set 0 or 1 */
   1135                (*q++) = static_cast<uint16_t>((p8[1] << 8) | p8[2]);
   1136            } else if(b==0x8e) {
   1137                /* code set 2 */
   1138                (*q++) = static_cast<uint16_t>(((p8[1] & 0x7f) << 8) | p8[2]);
   1139            } else /* b==0x8f */ {
   1140                /* code set 3 */
   1141                (*q++) = static_cast<uint16_t>((p8[1] << 8) | (p8[2] & 0x7f));
   1142            }
   1143            p8+=3;
   1144        }
   1145    } else /* oldLength==4 */ {
   1146        uint8_t *q=p8;
   1147        uint32_t* p32 = reinterpret_cast<uint32_t*>(p8);
   1148        for(i=0; i<old3Top; i+=4) {
   1149            value=(*p32++);
   1150            if(value<=0xffffff) {
   1151                /* short sequences are stored directly */
   1152                /* code set 0 or 1 */
   1153                (*q++) = static_cast<uint8_t>(value >> 16);
   1154                (*q++) = static_cast<uint8_t>(value >> 8);
   1155                (*q++) = static_cast<uint8_t>(value);
   1156            } else if(value<=0x8effffff) {
   1157                /* code set 2 */
   1158                (*q++) = static_cast<uint8_t>((value >> 16) & 0x7f);
   1159                (*q++) = static_cast<uint8_t>(value >> 8);
   1160                (*q++) = static_cast<uint8_t>(value);
   1161            } else /* first byte is 0x8f */ {
   1162                /* code set 3 */
   1163                (*q++) = static_cast<uint8_t>(value >> 16);
   1164                (*q++) = static_cast<uint8_t>((value >> 8) & 0x7f);
   1165                (*q++) = static_cast<uint8_t>(value);
   1166            }
   1167        }
   1168    }
   1169 
   1170    return true;
   1171 }
   1172 
   1173 /*
   1174 * Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far
   1175 * as possible. Overlapping is done on unassigned head and tail
   1176 * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
   1177 * Stage 1 indexes need to be adjusted accordingly.
   1178 * This function is very similar to genprops/store.c/compactStage().
   1179 */
   1180 static void
   1181 singleCompactStage2(MBCSData *mbcsData) {
   1182    /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
   1183    uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
   1184    uint16_t i, start, prevEnd, newStart;
   1185 
   1186    /* enter the all-unassigned first stage 2 block into the map */
   1187    map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
   1188 
   1189    /* begin with the first block after the all-unassigned one */
   1190    start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
   1191    while(start<mbcsData->stage2Top) {
   1192        prevEnd = static_cast<uint16_t>(newStart - 1);
   1193 
   1194        /* find the size of the overlap */
   1195        for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2Single[start+i]==0 && mbcsData->stage2Single[prevEnd-i]==0; ++i) {}
   1196 
   1197        if(i>0) {
   1198            map[start >> MBCS_STAGE_2_BLOCK_SIZE_SHIFT] = static_cast<uint16_t>(newStart - i);
   1199 
   1200            /* move the non-overlapping indexes to their new positions */
   1201            start+=i;
   1202            for (i = static_cast<uint16_t>(MBCS_STAGE_2_BLOCK_SIZE - i); i > 0; --i) {
   1203                mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
   1204            }
   1205        } else if(newStart<start) {
   1206            /* move the indexes to their new positions */
   1207            map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
   1208            for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
   1209                mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
   1210            }
   1211        } else /* no overlap && newStart==start */ {
   1212            map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
   1213            start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
   1214        }
   1215    }
   1216 
   1217    /* adjust stage2Top */
   1218    if(VERBOSE && newStart<mbcsData->stage2Top) {
   1219        printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
   1220               static_cast<unsigned long>(mbcsData->stage2Top), static_cast<unsigned long>(newStart),
   1221               static_cast<long>(mbcsData->stage2Top - newStart) * 2);
   1222    }
   1223    mbcsData->stage2Top=newStart;
   1224 
   1225    /* now adjust stage 1 */
   1226    for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
   1227        mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
   1228    }
   1229 }
   1230 
   1231 /* Compact stage 3 for SBCS - same algorithm as above. */
   1232 static void
   1233 singleCompactStage3(MBCSData *mbcsData) {
   1234    uint16_t* stage3 = reinterpret_cast<uint16_t*>(mbcsData->fromUBytes);
   1235 
   1236    /* this array maps the ordinal number of a stage 3 block to its new stage 2 index */
   1237    uint16_t map[0x1000];
   1238    uint16_t i, start, prevEnd, newStart;
   1239 
   1240    /* enter the all-unassigned first stage 3 block into the map */
   1241    map[0]=0;
   1242 
   1243    /* begin with the first block after the all-unassigned one */
   1244    start=newStart=16;
   1245    while(start<mbcsData->stage3Top) {
   1246        prevEnd = static_cast<uint16_t>(newStart - 1);
   1247 
   1248        /* find the size of the overlap */
   1249        for(i=0; i<16 && stage3[start+i]==0 && stage3[prevEnd-i]==0; ++i) {}
   1250 
   1251        if(i>0) {
   1252            map[start >> 4] = static_cast<uint16_t>(newStart - i);
   1253 
   1254            /* move the non-overlapping indexes to their new positions */
   1255            start+=i;
   1256            for (i = static_cast<uint16_t>(16 - i); i > 0; --i) {
   1257                stage3[newStart++]=stage3[start++];
   1258            }
   1259        } else if(newStart<start) {
   1260            /* move the indexes to their new positions */
   1261            map[start>>4]=newStart;
   1262            for(i=16; i>0; --i) {
   1263                stage3[newStart++]=stage3[start++];
   1264            }
   1265        } else /* no overlap && newStart==start */ {
   1266            map[start>>4]=start;
   1267            start=newStart+=16;
   1268        }
   1269    }
   1270 
   1271    /* adjust stage3Top */
   1272    if(VERBOSE && newStart<mbcsData->stage3Top) {
   1273        printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n",
   1274               static_cast<unsigned long>(mbcsData->stage3Top), static_cast<unsigned long>(newStart),
   1275               static_cast<long>(mbcsData->stage3Top - newStart) * 2);
   1276    }
   1277    mbcsData->stage3Top=newStart;
   1278 
   1279    /* now adjust stage 2 */
   1280    for(i=0; i<mbcsData->stage2Top; ++i) {
   1281        mbcsData->stage2Single[i]=map[mbcsData->stage2Single[i]>>4];
   1282    }
   1283 }
   1284 
   1285 /*
   1286 * Compact stage 2 by overlapping adjacent stage 2 blocks as far
   1287 * as possible. Overlapping is done on unassigned head and tail
   1288 * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
   1289 * Stage 1 indexes need to be adjusted accordingly.
   1290 * This function is very similar to genprops/store.c/compactStage().
   1291 */
   1292 static void
   1293 compactStage2(MBCSData *mbcsData) {
   1294    /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
   1295    uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
   1296    uint16_t i, start, prevEnd, newStart;
   1297 
   1298    /* enter the all-unassigned first stage 2 block into the map */
   1299    map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
   1300 
   1301    /* begin with the first block after the all-unassigned one */
   1302    start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
   1303    while(start<mbcsData->stage2Top) {
   1304        prevEnd = static_cast<uint16_t>(newStart - 1);
   1305 
   1306        /* find the size of the overlap */
   1307        for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2[start+i]==0 && mbcsData->stage2[prevEnd-i]==0; ++i) {}
   1308 
   1309        if(i>0) {
   1310            map[start >> MBCS_STAGE_2_BLOCK_SIZE_SHIFT] = static_cast<uint16_t>(newStart - i);
   1311 
   1312            /* move the non-overlapping indexes to their new positions */
   1313            start+=i;
   1314            for (i = static_cast<uint16_t>(MBCS_STAGE_2_BLOCK_SIZE - i); i > 0; --i) {
   1315                mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
   1316            }
   1317        } else if(newStart<start) {
   1318            /* move the indexes to their new positions */
   1319            map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
   1320            for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
   1321                mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
   1322            }
   1323        } else /* no overlap && newStart==start */ {
   1324            map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
   1325            start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
   1326        }
   1327    }
   1328 
   1329    /* adjust stage2Top */
   1330    if(VERBOSE && newStart<mbcsData->stage2Top) {
   1331        printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
   1332                static_cast<unsigned long>(mbcsData->stage2Top), static_cast<unsigned long>(newStart),
   1333                static_cast<long>(mbcsData->stage2Top - newStart) * 4);
   1334    }
   1335    mbcsData->stage2Top=newStart;
   1336 
   1337    /* now adjust stage 1 */
   1338    for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
   1339        mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
   1340    }
   1341 }
   1342 
   1343 static void
   1344 MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData * /*staticData*/) {
   1345    UCMStates *states;
   1346    int32_t maxCharLength, stage3Width;
   1347 
   1348    states=&mbcsData->ucm->states;
   1349    stage3Width=maxCharLength=states->maxCharLength;
   1350 
   1351    ucm_optimizeStates(states,
   1352                       &mbcsData->unicodeCodeUnits,
   1353                       mbcsData->toUFallbacks, mbcsData->countToUFallbacks,
   1354                       VERBOSE);
   1355 
   1356    /* try to compact the fromUnicode tables */
   1357    if(transformEUC(mbcsData)) {
   1358        --stage3Width;
   1359    }
   1360 
   1361    /*
   1362     * UTF-8-friendly tries are built precompacted, to cope with variable
   1363     * stage 3 allocation block sizes.
   1364     *
   1365     * Tables without precision indicators cannot be built that way,
   1366     * because if a block was overlapped with a previous one, then a smaller
   1367     * code point for the same block would not fit.
   1368     * Therefore, such tables are not marked UTF-8-friendly and must be
   1369     * compacted after all mappings are entered.
   1370     */
   1371    if(!mbcsData->utf8Friendly) {
   1372        if(maxCharLength==1) {
   1373            singleCompactStage3(mbcsData);
   1374            singleCompactStage2(mbcsData);
   1375        } else {
   1376            compactStage2(mbcsData);
   1377        }
   1378    }
   1379 
   1380    if(VERBOSE) {
   1381        /*uint32_t c, i1, i2, i2Limit, i3;*/
   1382 
   1383        printf("fromUnicode number of uint%s_t in stage 2: 0x%lx=%lu\n",
   1384               maxCharLength==1 ? "16" : "32",
   1385               static_cast<unsigned long>(mbcsData->stage2Top),
   1386               static_cast<unsigned long>(mbcsData->stage2Top));
   1387        printf("fromUnicode number of %d-byte stage 3 mapping entries: 0x%lx=%lu\n",
   1388               static_cast<int>(stage3Width),
   1389               static_cast<unsigned long>(mbcsData->stage3Top) / stage3Width,
   1390               static_cast<unsigned long>(mbcsData->stage3Top) / stage3Width);
   1391 #if 0
   1392        c=0;
   1393        for(i1=0; i1<MBCS_STAGE_1_SIZE; ++i1) {
   1394            i2=mbcsData->stage1[i1];
   1395            if(i2==0) {
   1396                c+=MBCS_STAGE_2_BLOCK_SIZE*MBCS_STAGE_3_BLOCK_SIZE;
   1397                continue;
   1398            }
   1399            for(i2Limit=i2+MBCS_STAGE_2_BLOCK_SIZE; i2<i2Limit; ++i2) {
   1400                if(maxCharLength==1) {
   1401                    i3=mbcsData->stage2Single[i2];
   1402                } else {
   1403                    i3=(uint16_t)mbcsData->stage2[i2];
   1404                }
   1405                if(i3==0) {
   1406                    c+=MBCS_STAGE_3_BLOCK_SIZE;
   1407                    continue;
   1408                }
   1409                printf("U+%04lx i1=0x%02lx i2=0x%04lx i3=0x%04lx\n",
   1410                       (unsigned long)c,
   1411                       (unsigned long)i1,
   1412                       (unsigned long)i2,
   1413                       (unsigned long)i3);
   1414                c+=MBCS_STAGE_3_BLOCK_SIZE;
   1415            }
   1416        }
   1417 #endif
   1418    }
   1419 }
   1420 
   1421 U_CDECL_BEGIN
   1422 static uint32_t
   1423 MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
   1424          UNewDataMemory *pData, int32_t tableType) {
   1425    MBCSData *mbcsData=(MBCSData *)cnvData;
   1426    uint32_t stage2Start, stage2Length;
   1427    uint32_t top, stageUTF8Length=0;
   1428    int32_t i, stage1Top;
   1429    uint32_t headerLength;
   1430 
   1431    _MBCSHeader header=UCNV_MBCS_HEADER_INITIALIZER;
   1432 
   1433    stage2Length=mbcsData->stage2Top;
   1434    if(mbcsData->omitFromU) {
   1435        /* find how much of stage2 can be omitted */
   1436        int32_t utf8Limit=(int32_t)mbcsData->utf8Max+1;
   1437        uint32_t st2=0; /*initialized it to avoid compiler warnings */
   1438 
   1439        i=utf8Limit>>MBCS_STAGE_1_SHIFT;
   1440        if((utf8Limit&((1<<MBCS_STAGE_1_SHIFT)-1))!=0 && (st2=mbcsData->stage1[i])!=0) {
   1441            /* utf8Limit is in the middle of an existing stage 2 block */
   1442            stage2Start=st2+((utf8Limit>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK);
   1443        } else {
   1444            /* find the last stage2 block with mappings before utf8Limit */
   1445            while(i>0 && (st2=mbcsData->stage1[--i])==0) {}
   1446            /* stage2 up to the end of this block corresponds to stageUTF8 */
   1447            stage2Start=st2+MBCS_STAGE_2_BLOCK_SIZE;
   1448        }
   1449        header.options|=MBCS_OPT_NO_FROM_U;
   1450        header.fullStage2Length=stage2Length;
   1451        stage2Length-=stage2Start;
   1452        if(VERBOSE) {
   1453            printf("+ omitting %lu out of %lu stage2 entries and %lu fromUBytes\n",
   1454                    (unsigned long)stage2Start,
   1455                    (unsigned long)mbcsData->stage2Top,
   1456                    (unsigned long)mbcsData->stage3Top);
   1457            printf("+ total size savings: %lu bytes\n", (unsigned long)stage2Start*4+mbcsData->stage3Top);
   1458        }
   1459    } else {
   1460        stage2Start=0;
   1461    }
   1462 
   1463    if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
   1464        stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
   1465    } else {
   1466        stage1Top=0x40; /* 0x40==64 */
   1467    }
   1468 
   1469    /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
   1470    if(mbcsData->ucm->states.maxCharLength==1) {
   1471        for(i=0; i<stage1Top; ++i) {
   1472            mbcsData->stage1[i]+=(uint16_t)stage1Top;
   1473        }
   1474 
   1475        /* stage2Top/Length have counted 16-bit results, now we need to count bytes */
   1476        /* also round up to a multiple of 4 bytes */
   1477        stage2Length=(stage2Length*2+1)&~1;
   1478 
   1479        /* stage3Top has counted 16-bit results, now we need to count bytes */
   1480        mbcsData->stage3Top*=2;
   1481 
   1482        if(mbcsData->utf8Friendly) {
   1483            header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */
   1484        }
   1485    } else {
   1486        for(i=0; i<stage1Top; ++i) {
   1487            mbcsData->stage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */
   1488        }
   1489 
   1490        /* stage2Top/Length have counted 32-bit results, now we need to count bytes */
   1491        stage2Length*=4;
   1492        /* leave stage2Start counting 32-bit units */
   1493 
   1494        if(mbcsData->utf8Friendly) {
   1495            stageUTF8Length=(mbcsData->utf8Max+1)>>MBCS_UTF8_STAGE_SHIFT;
   1496            header.version[2]=(uint8_t)(mbcsData->utf8Max>>8); /* store 0xd7 for max==0xd7ff */
   1497        }
   1498 
   1499        /* stage3Top has already counted bytes */
   1500    }
   1501 
   1502    /* round up stage3Top so that the sizes of all data blocks are multiples of 4 */
   1503    mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3;
   1504 
   1505    /* fill the header */
   1506    if(header.options&MBCS_OPT_INCOMPATIBLE_MASK) {
   1507        header.version[0]=5;
   1508        if(header.options&MBCS_OPT_NO_FROM_U) {
   1509            headerLength=10;  /* include fullStage2Length */
   1510        } else {
   1511            headerLength=MBCS_HEADER_V5_MIN_LENGTH;  /* 9 */
   1512        }
   1513    } else {
   1514        header.version[0]=4;
   1515        headerLength=MBCS_HEADER_V4_LENGTH;  /* 8 */
   1516    }
   1517    header.version[1]=4;
   1518    /* header.version[2] set above for utf8Friendly data */
   1519 
   1520    header.options |= headerLength;
   1521 
   1522    header.countStates=mbcsData->ucm->states.countStates;
   1523    header.countToUFallbacks=mbcsData->countToUFallbacks;
   1524 
   1525    header.offsetToUCodeUnits=
   1526        headerLength*4+
   1527        mbcsData->ucm->states.countStates*1024+
   1528        mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback);
   1529    header.offsetFromUTable=
   1530        header.offsetToUCodeUnits+
   1531        mbcsData->ucm->states.countToUCodeUnits*2;
   1532    header.offsetFromUBytes=
   1533        header.offsetFromUTable+
   1534        stage1Top*2+
   1535        stage2Length;
   1536    header.fromUBytesLength=mbcsData->stage3Top;
   1537 
   1538    top=header.offsetFromUBytes+stageUTF8Length*2;
   1539    if(!(header.options&MBCS_OPT_NO_FROM_U)) {
   1540        top+=header.fromUBytesLength;
   1541    }
   1542 
   1543    header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
   1544 
   1545    if(tableType&TABLE_EXT) {
   1546        if(top>0xffffff) {
   1547            fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top);
   1548            return 0;
   1549        }
   1550 
   1551        header.flags|=top<<8;
   1552    }
   1553 
   1554    /* write the MBCS data */
   1555    udata_writeBlock(pData, &header, headerLength*4);
   1556    udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024);
   1557    udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback));
   1558    udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2);
   1559    udata_writeBlock(pData, mbcsData->stage1, stage1Top*2);
   1560    if(mbcsData->ucm->states.maxCharLength==1) {
   1561        udata_writeBlock(pData, mbcsData->stage2Single+stage2Start, stage2Length);
   1562    } else {
   1563        udata_writeBlock(pData, mbcsData->stage2+stage2Start, stage2Length);
   1564    }
   1565    if(!(header.options&MBCS_OPT_NO_FROM_U)) {
   1566        udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
   1567    }
   1568 
   1569    if(stageUTF8Length>0) {
   1570        udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2);
   1571    }
   1572 
   1573    /* return the number of bytes that should have been written */
   1574    return top;
   1575 }
   1576 U_CDECL_END