tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utrie.cpp (38831B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  utrie.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2001oct20
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This is a common implementation of a "folded" trie.
     19 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
     20 *   Unicode code points (0..0x10ffff).
     21 */
     22 
     23 #ifdef UTRIE_DEBUG
     24 #   include <stdio.h>
     25 #endif
     26 
     27 #include "unicode/utypes.h"
     28 #include "cmemory.h"
     29 #include "utrie.h"
     30 
     31 /* miscellaneous ------------------------------------------------------------ */
     32 
     33 #undef ABS
     34 #define ABS(x) ((x)>=0 ? (x) : -(x))
     35 
     36 static inline UBool
     37 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
     38    while(length>0 && *s==*t) {
     39        ++s;
     40        ++t;
     41        --length;
     42    }
     43    return length == 0;
     44 }
     45 
     46 /* Building a trie ----------------------------------------------------------*/
     47 
     48 U_CAPI UNewTrie * U_EXPORT2
     49 utrie_open(UNewTrie *fillIn,
     50           uint32_t *aliasData, int32_t maxDataLength,
     51           uint32_t initialValue, uint32_t leadUnitValue,
     52           UBool latin1Linear) {
     53    UNewTrie *trie;
     54    int32_t i, j;
     55 
     56    if( maxDataLength<UTRIE_DATA_BLOCK_LENGTH ||
     57        (latin1Linear && maxDataLength<1024)
     58    ) {
     59        return nullptr;
     60    }
     61 
     62    if(fillIn!=nullptr) {
     63        trie=fillIn;
     64    } else {
     65        trie=(UNewTrie *)uprv_malloc(sizeof(UNewTrie));
     66        if(trie==nullptr) {
     67            return nullptr;
     68        }
     69    }
     70    uprv_memset(trie, 0, sizeof(UNewTrie));
     71    trie->isAllocated = fillIn == nullptr;
     72 
     73    if(aliasData!=nullptr) {
     74        trie->data=aliasData;
     75        trie->isDataAllocated=false;
     76    } else {
     77        trie->data=(uint32_t *)uprv_malloc(maxDataLength*4);
     78        if(trie->data==nullptr) {
     79            uprv_free(trie);
     80            return nullptr;
     81        }
     82        trie->isDataAllocated=true;
     83    }
     84 
     85    /* preallocate and reset the first data block (block index 0) */
     86    j=UTRIE_DATA_BLOCK_LENGTH;
     87 
     88    if(latin1Linear) {
     89        /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */
     90        /* made sure above that maxDataLength>=1024 */
     91 
     92        /* set indexes to point to consecutive data blocks */
     93        i=0;
     94        do {
     95            /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */
     96            trie->index[i++]=j;
     97            j+=UTRIE_DATA_BLOCK_LENGTH;
     98        } while(i<(256>>UTRIE_SHIFT));
     99    }
    100 
    101    /* reset the initially allocated blocks to the initial value */
    102    trie->dataLength=j;
    103    while(j>0) {
    104        trie->data[--j]=initialValue;
    105    }
    106 
    107    trie->leadUnitValue=leadUnitValue;
    108    trie->indexLength=UTRIE_MAX_INDEX_LENGTH;
    109    trie->dataCapacity=maxDataLength;
    110    trie->isLatin1Linear=latin1Linear;
    111    trie->isCompacted=false;
    112    return trie;
    113 }
    114 
    115 U_CAPI UNewTrie * U_EXPORT2
    116 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) {
    117    UNewTrie *trie;
    118    UBool isDataAllocated;
    119 
    120    /* do not clone if other is not valid or already compacted */
    121    if(other==nullptr || other->data==nullptr || other->isCompacted) {
    122        return nullptr;
    123    }
    124 
    125    /* clone data */
    126    if(aliasData!=nullptr && aliasDataCapacity>=other->dataCapacity) {
    127        isDataAllocated=false;
    128    } else {
    129        aliasDataCapacity=other->dataCapacity;
    130        aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4);
    131        if(aliasData==nullptr) {
    132            return nullptr;
    133        }
    134        isDataAllocated=true;
    135    }
    136 
    137    trie=utrie_open(fillIn, aliasData, aliasDataCapacity,
    138                    other->data[0], other->leadUnitValue,
    139                    other->isLatin1Linear);
    140    if(trie==nullptr) {
    141        uprv_free(aliasData);
    142    } else {
    143        uprv_memcpy(trie->index, other->index, sizeof(trie->index));
    144        uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);
    145        trie->dataLength=other->dataLength;
    146        trie->isDataAllocated=isDataAllocated;
    147    }
    148 
    149    return trie;
    150 }
    151 
    152 U_CAPI void U_EXPORT2
    153 utrie_close(UNewTrie *trie) {
    154    if(trie!=nullptr) {
    155        if(trie->isDataAllocated) {
    156            uprv_free(trie->data);
    157            trie->data=nullptr;
    158        }
    159        if(trie->isAllocated) {
    160            uprv_free(trie);
    161        }
    162    }
    163 }
    164 
    165 U_CAPI uint32_t * U_EXPORT2
    166 utrie_getData(UNewTrie *trie, int32_t *pLength) {
    167    if(trie==nullptr || pLength==nullptr) {
    168        return nullptr;
    169    }
    170 
    171    *pLength=trie->dataLength;
    172    return trie->data;
    173 }
    174 
    175 static int32_t
    176 utrie_allocDataBlock(UNewTrie *trie) {
    177    int32_t newBlock, newTop;
    178 
    179    newBlock=trie->dataLength;
    180    newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;
    181    if(newTop>trie->dataCapacity) {
    182        /* out of memory in the data array */
    183        return -1;
    184    }
    185    trie->dataLength=newTop;
    186    return newBlock;
    187 }
    188 
    189 /**
    190 * No error checking for illegal arguments.
    191 *
    192 * @return -1 if no new data block available (out of memory in data array)
    193 * @internal
    194 */
    195 static int32_t
    196 utrie_getDataBlock(UNewTrie *trie, UChar32 c) {
    197    int32_t indexValue, newBlock;
    198 
    199    c>>=UTRIE_SHIFT;
    200    indexValue=trie->index[c];
    201    if(indexValue>0) {
    202        return indexValue;
    203    }
    204 
    205    /* allocate a new data block */
    206    newBlock=utrie_allocDataBlock(trie);
    207    if(newBlock<0) {
    208        /* out of memory in the data array */
    209        return -1;
    210    }
    211    trie->index[c]=newBlock;
    212 
    213    /* copy-on-write for a block from a setRange() */
    214    uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH);
    215    return newBlock;
    216 }
    217 
    218 /**
    219 * @return true if the value was successfully set
    220 */
    221 U_CAPI UBool U_EXPORT2
    222 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) {
    223    int32_t block;
    224 
    225    /* valid, uncompacted trie and valid c? */
    226    if(trie==nullptr || trie->isCompacted || (uint32_t)c>0x10ffff) {
    227        return false;
    228    }
    229 
    230    block=utrie_getDataBlock(trie, c);
    231    if(block<0) {
    232        return false;
    233    }
    234 
    235    trie->data[block+(c&UTRIE_MASK)]=value;
    236    return true;
    237 }
    238 
    239 U_CAPI uint32_t U_EXPORT2
    240 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) {
    241    int32_t block;
    242 
    243    /* valid, uncompacted trie and valid c? */
    244    if(trie==nullptr || trie->isCompacted || (uint32_t)c>0x10ffff) {
    245        if(pInBlockZero!=nullptr) {
    246            *pInBlockZero=true;
    247        }
    248        return 0;
    249    }
    250 
    251    block=trie->index[c>>UTRIE_SHIFT];
    252    if(pInBlockZero!=nullptr) {
    253        *pInBlockZero = block == 0;
    254    }
    255 
    256    return trie->data[ABS(block)+(c&UTRIE_MASK)];
    257 }
    258 
    259 /**
    260 * @internal
    261 */
    262 static void
    263 utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
    264                uint32_t value, uint32_t initialValue, UBool overwrite) {
    265    uint32_t *pLimit;
    266 
    267    pLimit=block+limit;
    268    block+=start;
    269    if(overwrite) {
    270        while(block<pLimit) {
    271            *block++=value;
    272        }
    273    } else {
    274        while(block<pLimit) {
    275            if(*block==initialValue) {
    276                *block=value;
    277            }
    278            ++block;
    279        }
    280    }
    281 }
    282 
    283 U_CAPI UBool U_EXPORT2
    284 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite) {
    285    /*
    286     * repeat value in [start..limit[
    287     * mark index values for repeat-data blocks by setting bit 31 of the index values
    288     * fill around existing values if any, if(overwrite)
    289     */
    290    uint32_t initialValue;
    291    int32_t block, rest, repeatBlock;
    292 
    293    /* valid, uncompacted trie and valid indexes? */
    294    if( trie==nullptr || trie->isCompacted ||
    295        (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit
    296    ) {
    297        return false;
    298    }
    299    if(start==limit) {
    300        return true; /* nothing to do */
    301    }
    302 
    303    initialValue=trie->data[0];
    304    if(start&UTRIE_MASK) {
    305        UChar32 nextStart;
    306 
    307        /* set partial block at [start..following block boundary[ */
    308        block=utrie_getDataBlock(trie, start);
    309        if(block<0) {
    310            return false;
    311        }
    312 
    313        nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK;
    314        if(nextStart<=limit) {
    315            utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH,
    316                            value, initialValue, overwrite);
    317            start=nextStart;
    318        } else {
    319            utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK,
    320                            value, initialValue, overwrite);
    321            return true;
    322        }
    323    }
    324 
    325    /* number of positions in the last, partial block */
    326    rest=limit&UTRIE_MASK;
    327 
    328    /* round down limit to a block boundary */
    329    limit&=~UTRIE_MASK;
    330 
    331    /* iterate over all-value blocks */
    332    if(value==initialValue) {
    333        repeatBlock=0;
    334    } else {
    335        repeatBlock=-1;
    336    }
    337    while(start<limit) {
    338        /* get index value */
    339        block=trie->index[start>>UTRIE_SHIFT];
    340        if(block>0) {
    341            /* already allocated, fill in value */
    342            utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite);
    343        } else if(trie->data[-block]!=value && (block==0 || overwrite)) {
    344            /* set the repeatBlock instead of the current block 0 or range block */
    345            if(repeatBlock>=0) {
    346                trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
    347            } else {
    348                /* create and set and fill the repeatBlock */
    349                repeatBlock=utrie_getDataBlock(trie, start);
    350                if(repeatBlock<0) {
    351                    return false;
    352                }
    353 
    354                /* set the negative block number to indicate that it is a repeat block */
    355                trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
    356                utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, true);
    357            }
    358        }
    359 
    360        start+=UTRIE_DATA_BLOCK_LENGTH;
    361    }
    362 
    363    if(rest>0) {
    364        /* set partial block at [last block boundary..limit[ */
    365        block=utrie_getDataBlock(trie, start);
    366        if(block<0) {
    367            return false;
    368        }
    369 
    370        utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite);
    371    }
    372 
    373    return true;
    374 }
    375 
    376 static int32_t
    377 _findSameIndexBlock(const int32_t *idx, int32_t indexLength,
    378                    int32_t otherBlock) {
    379    int32_t block, i;
    380 
    381    for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) {
    382        for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) {
    383            if(idx[block+i]!=idx[otherBlock+i]) {
    384                break;
    385            }
    386        }
    387        if(i==UTRIE_SURROGATE_BLOCK_COUNT) {
    388            return block;
    389        }
    390    }
    391    return indexLength;
    392 }
    393 
    394 /*
    395 * Fold the normalization data for supplementary code points into
    396 * a compact area on top of the BMP-part of the trie index,
    397 * with the lead surrogates indexing this compact area.
    398 *
    399 * Duplicate the index values for lead surrogates:
    400 * From inside the BMP area, where some may be overridden with folded values,
    401 * to just after the BMP area, where they can be retrieved for
    402 * code point lookups.
    403 */
    404 static void
    405 utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) {
    406    int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT];
    407    int32_t *idx;
    408    uint32_t value;
    409    UChar32 c;
    410    int32_t indexLength, block;
    411 #ifdef UTRIE_DEBUG
    412    int countLeadCUWithData=0;
    413 #endif
    414 
    415    idx=trie->index;
    416 
    417    /* copy the lead surrogate indexes into a temporary array */
    418    uprv_memcpy(leadIndexes, idx+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
    419 
    420    /*
    421     * set all values for lead surrogate code *units* to leadUnitValue
    422     * so that, by default, runtime lookups will find no data for associated
    423     * supplementary code points, unless there is data for such code points
    424     * which will result in a non-zero folding value below that is set for
    425     * the respective lead units
    426     *
    427     * the above saved the indexes for surrogate code *points*
    428     * fill the indexes with simplified code from utrie_setRange32()
    429     */
    430    if(trie->leadUnitValue==trie->data[0]) {
    431        block=0; /* leadUnitValue==initialValue, use all-initial-value block */
    432    } else {
    433        /* create and fill the repeatBlock */
    434        block=utrie_allocDataBlock(trie);
    435        if(block<0) {
    436            /* data table overflow */
    437            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    438            return;
    439        }
    440        utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], true);
    441        block=-block; /* negative block number to indicate that it is a repeat block */
    442    }
    443    for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) {
    444        trie->index[c]=block;
    445    }
    446 
    447    /*
    448     * Fold significant index values into the area just after the BMP indexes.
    449     * In case the first lead surrogate has significant data,
    450     * its index block must be used first (in which case the folding is a no-op).
    451     * Later all folded index blocks are moved up one to insert the copied
    452     * lead surrogate indexes.
    453     */
    454    indexLength=UTRIE_BMP_INDEX_LENGTH;
    455 
    456    /* search for any index (stage 1) entries for supplementary code points */
    457    for(c=0x10000; c<0x110000;) {
    458        if(idx[c>>UTRIE_SHIFT]!=0) {
    459            /* there is data, treat the full block for a lead surrogate */
    460            c&=~0x3ff;
    461 
    462 #ifdef UTRIE_DEBUG
    463            ++countLeadCUWithData;
    464            /* printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10))); */
    465 #endif
    466 
    467            /* is there an identical index block? */
    468            block=_findSameIndexBlock(idx, indexLength, c>>UTRIE_SHIFT);
    469 
    470            /*
    471             * get a folded value for [c..c+0x400[ and,
    472             * if different from the value for the lead surrogate code point,
    473             * set it for the lead surrogate code unit
    474             */
    475            value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT);
    476            if(value!=utrie_get32(trie, U16_LEAD(c), nullptr)) {
    477                if(!utrie_set32(trie, U16_LEAD(c), value)) {
    478                    /* data table overflow */
    479                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    480                    return;
    481                }
    482 
    483                /* if we did not find an identical index block... */
    484                if(block==indexLength) {
    485                    /* move the actual index (stage 1) entries from the supplementary position to the new one */
    486                    uprv_memmove(idx+indexLength,
    487                                 idx+(c>>UTRIE_SHIFT),
    488                                 4*UTRIE_SURROGATE_BLOCK_COUNT);
    489                    indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
    490                }
    491            }
    492            c+=0x400;
    493        } else {
    494            c+=UTRIE_DATA_BLOCK_LENGTH;
    495        }
    496    }
    497 #ifdef UTRIE_DEBUG
    498    if(countLeadCUWithData>0) {
    499        printf("supplementary data for %d lead surrogates\n", countLeadCUWithData);
    500    }
    501 #endif
    502 
    503    /*
    504     * index array overflow?
    505     * This is to guarantee that a folding offset is of the form
    506     * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
    507     * If the index is too large, then n>=1024 and more than 10 bits are necessary.
    508     *
    509     * In fact, it can only ever become n==1024 with completely unfoldable data and
    510     * the additional block of duplicated values for lead surrogates.
    511     */
    512    if(indexLength>=UTRIE_MAX_INDEX_LENGTH) {
    513        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    514        return;
    515    }
    516 
    517    /*
    518     * make space for the lead surrogate index block and
    519     * insert it between the BMP indexes and the folded ones
    520     */
    521    uprv_memmove(idx+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,
    522                 idx+UTRIE_BMP_INDEX_LENGTH,
    523                 4*(indexLength-UTRIE_BMP_INDEX_LENGTH));
    524    uprv_memcpy(idx+UTRIE_BMP_INDEX_LENGTH,
    525                leadIndexes,
    526                4*UTRIE_SURROGATE_BLOCK_COUNT);
    527    indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
    528 
    529 #ifdef UTRIE_DEBUG
    530    printf("trie index count: BMP %ld  all Unicode %ld  folded %ld\n",
    531           UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength);
    532 #endif
    533 
    534    trie->indexLength=indexLength;
    535 }
    536 
    537 /*
    538 * Set a value in the trie index map to indicate which data block
    539 * is referenced and which one is not.
    540 * utrie_compact() will remove data blocks that are not used at all.
    541 * Set
    542 * - 0 if it is used
    543 * - -1 if it is not used
    544 */
    545 static void
    546 _findUnusedBlocks(UNewTrie *trie) {
    547    int32_t i;
    548 
    549    /* fill the entire map with "not used" */
    550    uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4);
    551 
    552    /* mark each block that _is_ used with 0 */
    553    for(i=0; i<trie->indexLength; ++i) {
    554        trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0;
    555    }
    556 
    557    /* never move the all-initial-value block 0 */
    558    trie->map[0]=0;
    559 }
    560 
    561 static int32_t
    562 _findSameDataBlock(const uint32_t *data, int32_t dataLength,
    563                   int32_t otherBlock, int32_t step) {
    564    int32_t block;
    565 
    566    /* ensure that we do not even partially get past dataLength */
    567    dataLength-=UTRIE_DATA_BLOCK_LENGTH;
    568 
    569    for(block=0; block<=dataLength; block+=step) {
    570        if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) {
    571            return block;
    572        }
    573    }
    574    return -1;
    575 }
    576 
    577 /*
    578 * Compact a folded build-time trie.
    579 *
    580 * The compaction
    581 * - removes blocks that are identical with earlier ones
    582 * - overlaps adjacent blocks as much as possible (if overlap==true)
    583 * - moves blocks in steps of the data granularity
    584 * - moves and overlaps blocks that overlap with multiple values in the overlap region
    585 *
    586 * It does not
    587 * - try to move and overlap blocks that are not already adjacent
    588 */
    589 static void
    590 utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
    591    int32_t i, start, newStart, overlapStart;
    592 
    593    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
    594        return;
    595    }
    596 
    597    /* valid, uncompacted trie? */
    598    if(trie==nullptr) {
    599        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    600        return;
    601    }
    602    if(trie->isCompacted) {
    603        return; /* nothing left to do */
    604    }
    605 
    606    /* compaction */
    607 
    608    /* initialize the index map with "block is used/unused" flags */
    609    _findUnusedBlocks(trie);
    610 
    611    /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
    612    if(trie->isLatin1Linear && UTRIE_SHIFT<=8) {
    613        overlapStart=UTRIE_DATA_BLOCK_LENGTH+256;
    614    } else {
    615        overlapStart=UTRIE_DATA_BLOCK_LENGTH;
    616    }
    617 
    618    newStart=UTRIE_DATA_BLOCK_LENGTH;
    619    for(start=newStart; start<trie->dataLength;) {
    620        /*
    621         * start: index of first entry of current block
    622         * newStart: index where the current block is to be moved
    623         *           (right after current end of already-compacted data)
    624         */
    625 
    626        /* skip blocks that are not used */
    627        if(trie->map[start>>UTRIE_SHIFT]<0) {
    628            /* advance start to the next block */
    629            start+=UTRIE_DATA_BLOCK_LENGTH;
    630 
    631            /* leave newStart with the previous block! */
    632            continue;
    633        }
    634 
    635        /* search for an identical block */
    636        if( start>=overlapStart &&
    637            (i=_findSameDataBlock(trie->data, newStart, start,
    638                            overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH))
    639             >=0
    640        ) {
    641            /* found an identical block, set the other block's index value for the current block */
    642            trie->map[start>>UTRIE_SHIFT]=i;
    643 
    644            /* advance start to the next block */
    645            start+=UTRIE_DATA_BLOCK_LENGTH;
    646 
    647            /* leave newStart with the previous block! */
    648            continue;
    649        }
    650 
    651        /* see if the beginning of this block can be overlapped with the end of the previous block */
    652        if(overlap && start>=overlapStart) {
    653            /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
    654            for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY;
    655                i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i);
    656                i-=UTRIE_DATA_GRANULARITY) {}
    657        } else {
    658            i=0;
    659        }
    660 
    661        if(i>0) {
    662            /* some overlap */
    663            trie->map[start>>UTRIE_SHIFT]=newStart-i;
    664 
    665            /* move the non-overlapping indexes to their new positions */
    666            start+=i;
    667            for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) {
    668                trie->data[newStart++]=trie->data[start++];
    669            }
    670        } else if(newStart<start) {
    671            /* no overlap, just move the indexes to their new positions */
    672            trie->map[start>>UTRIE_SHIFT]=newStart;
    673            for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) {
    674                trie->data[newStart++]=trie->data[start++];
    675            }
    676        } else /* no overlap && newStart==start */ {
    677            trie->map[start>>UTRIE_SHIFT]=start;
    678            newStart+=UTRIE_DATA_BLOCK_LENGTH;
    679            start=newStart;
    680        }
    681    }
    682 
    683    /* now adjust the index (stage 1) table */
    684    for(i=0; i<trie->indexLength; ++i) {
    685        trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT];
    686    }
    687 
    688 #ifdef UTRIE_DEBUG
    689    /* we saved some space */
    690    printf("compacting trie: count of 32-bit words %lu->%lu\n",
    691            (long)trie->dataLength, (long)newStart);
    692 #endif
    693 
    694    trie->dataLength=newStart;
    695 }
    696 
    697 /* serialization ------------------------------------------------------------ */
    698 
    699 /*
    700 * Default function for the folding value:
    701 * Just store the offset (16 bits) if there is any non-initial-value entry.
    702 *
    703 * The offset parameter is never 0.
    704 * Returning the offset itself is safe for UTRIE_SHIFT>=5 because
    705 * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800
    706 * which fits into 16-bit trie values;
    707 * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.
    708 *
    709 * Theoretically, it would be safer for all possible UTRIE_SHIFT including
    710 * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS
    711 * which would always result in a value of 0x40..0x43f
    712 * (start/end 1k blocks of supplementary Unicode code points).
    713 * However, this would be uglier, and would not work for some existing
    714 * binary data file formats.
    715 *
    716 * Also, we do not plan to change UTRIE_SHIFT because it would change binary
    717 * data file formats, and we would probably not make it smaller because of
    718 * the then even larger BMP index length even for empty tries.
    719 */
    720 static uint32_t U_CALLCONV
    721 defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    722    uint32_t value, initialValue;
    723    UChar32 limit;
    724    UBool inBlockZero;
    725 
    726    initialValue=trie->data[0];
    727    limit=start+0x400;
    728    while(start<limit) {
    729        value=utrie_get32(trie, start, &inBlockZero);
    730        if(inBlockZero) {
    731            start+=UTRIE_DATA_BLOCK_LENGTH;
    732        } else if(value!=initialValue) {
    733            return static_cast<uint32_t>(offset);
    734        } else {
    735            ++start;
    736        }
    737    }
    738    return 0;
    739 }
    740 
    741 U_CAPI int32_t U_EXPORT2
    742 utrie_serialize(UNewTrie *trie, void *dt, int32_t capacity,
    743                UNewTrieGetFoldedValue *getFoldedValue,
    744                UBool reduceTo16Bits,
    745                UErrorCode *pErrorCode) {
    746    UTrieHeader *header;
    747    uint32_t *p;
    748    uint16_t *dest16;
    749    int32_t i, length;
    750    uint8_t* data = nullptr;
    751 
    752    /* argument check */
    753    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
    754        return 0;
    755    }
    756 
    757    if(trie==nullptr || capacity<0 || (capacity>0 && dt==nullptr)) {
    758        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    759        return 0;
    760    }
    761    if(getFoldedValue==nullptr) {
    762        getFoldedValue=defaultGetFoldedValue;
    763    }
    764 
    765    data = (uint8_t*)dt;
    766    /* fold and compact if necessary, also checks that indexLength is within limits */
    767    if(!trie->isCompacted) {
    768        /* compact once without overlap to improve folding */
    769        utrie_compact(trie, false, pErrorCode);
    770 
    771        /* fold the supplementary part of the index array */
    772        utrie_fold(trie, getFoldedValue, pErrorCode);
    773 
    774        /* compact again with overlap for minimum data array length */
    775        utrie_compact(trie, true, pErrorCode);
    776 
    777        trie->isCompacted=true;
    778        if(U_FAILURE(*pErrorCode)) {
    779            return 0;
    780        }
    781    }
    782 
    783    /* is dataLength within limits? */
    784    if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) {
    785        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    786    }
    787 
    788    length=sizeof(UTrieHeader)+2*trie->indexLength;
    789    if(reduceTo16Bits) {
    790        length+=2*trie->dataLength;
    791    } else {
    792        length+=4*trie->dataLength;
    793    }
    794 
    795    if(length>capacity) {
    796        return length; /* preflighting */
    797    }
    798 
    799 #ifdef UTRIE_DEBUG
    800    printf("**UTrieLengths(serialize)** index:%6ld  data:%6ld  serialized:%6ld\n",
    801           (long)trie->indexLength, (long)trie->dataLength, (long)length);
    802 #endif
    803 
    804    /* set the header fields */
    805    header=(UTrieHeader *)data;
    806    data+=sizeof(UTrieHeader);
    807 
    808    header->signature=0x54726965; /* "Trie" */
    809    header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<<UTRIE_OPTIONS_INDEX_SHIFT);
    810 
    811    if(!reduceTo16Bits) {
    812        header->options|=UTRIE_OPTIONS_DATA_IS_32_BIT;
    813    }
    814    if(trie->isLatin1Linear) {
    815        header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR;
    816    }
    817 
    818    header->indexLength=trie->indexLength;
    819    header->dataLength=trie->dataLength;
    820 
    821    /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
    822    if(reduceTo16Bits) {
    823        /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
    824        p=(uint32_t *)trie->index;
    825        dest16=(uint16_t *)data;
    826        for(i=trie->indexLength; i>0; --i) {
    827            *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT);
    828        }
    829 
    830        /* write 16-bit data values */
    831        p=trie->data;
    832        for(i=trie->dataLength; i>0; --i) {
    833            *dest16++=(uint16_t)*p++;
    834        }
    835    } else {
    836        /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
    837        p=(uint32_t *)trie->index;
    838        dest16=(uint16_t *)data;
    839        for(i=trie->indexLength; i>0; --i) {
    840            *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT);
    841        }
    842 
    843        /* write 32-bit data values */
    844        uprv_memcpy(dest16, trie->data, 4*(size_t)trie->dataLength);
    845    }
    846 
    847    return length;
    848 }
    849 
    850 /* inverse to defaultGetFoldedValue() */
    851 U_CAPI int32_t U_EXPORT2
    852 utrie_defaultGetFoldingOffset(uint32_t data) {
    853    return (int32_t)data;
    854 }
    855 
    856 U_CAPI int32_t U_EXPORT2
    857 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) {
    858    const UTrieHeader *header;
    859    const uint16_t *p16;
    860    uint32_t options;
    861 
    862    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
    863        return -1;
    864    }
    865 
    866    /* enough data for a trie header? */
    867    if(length<(int32_t)sizeof(UTrieHeader)) {
    868        *pErrorCode=U_INVALID_FORMAT_ERROR;
    869        return -1;
    870    }
    871 
    872    /* check the signature */
    873    header=(const UTrieHeader *)data;
    874    if(header->signature!=0x54726965) {
    875        *pErrorCode=U_INVALID_FORMAT_ERROR;
    876        return -1;
    877    }
    878 
    879    /* get the options and check the shift values */
    880    options=header->options;
    881    if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
    882        ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT
    883    ) {
    884        *pErrorCode=U_INVALID_FORMAT_ERROR;
    885        return -1;
    886    }
    887    trie->isLatin1Linear = (options & UTRIE_OPTIONS_LATIN1_IS_LINEAR) != 0;
    888 
    889    /* get the length values */
    890    trie->indexLength=header->indexLength;
    891    trie->dataLength=header->dataLength;
    892 
    893    length-=(int32_t)sizeof(UTrieHeader);
    894 
    895    /* enough data for the index? */
    896    if(length<2*trie->indexLength) {
    897        *pErrorCode=U_INVALID_FORMAT_ERROR;
    898        return -1;
    899    }
    900    p16=(const uint16_t *)(header+1);
    901    trie->index=p16;
    902    p16+=trie->indexLength;
    903    length-=2*trie->indexLength;
    904 
    905    /* get the data */
    906    if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) {
    907        if(length<4*trie->dataLength) {
    908            *pErrorCode=U_INVALID_FORMAT_ERROR;
    909            return -1;
    910        }
    911        trie->data32=(const uint32_t *)p16;
    912        trie->initialValue=trie->data32[0];
    913        length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength;
    914    } else {
    915        if(length<2*trie->dataLength) {
    916            *pErrorCode=U_INVALID_FORMAT_ERROR;
    917            return -1;
    918        }
    919 
    920        /* the "data16" data is used via the index pointer */
    921        trie->data32=nullptr;
    922        trie->initialValue=trie->index[trie->indexLength];
    923        length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength;
    924    }
    925 
    926    trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
    927 
    928    return length;
    929 }
    930 
    931 U_CAPI int32_t U_EXPORT2
    932 utrie_unserializeDummy(UTrie *trie,
    933                       void *data, int32_t length,
    934                       uint32_t initialValue, uint32_t leadUnitValue,
    935                       UBool make16BitTrie,
    936                       UErrorCode *pErrorCode) {
    937    uint16_t *p16;
    938    int32_t actualLength, latin1Length, i, limit;
    939    uint16_t block;
    940 
    941    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
    942        return -1;
    943    }
    944 
    945    /* calculate the actual size of the dummy trie data */
    946 
    947    /* max(Latin-1, block 0) */
    948    latin1Length= 256; /*UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;*/
    949 
    950    trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT;
    951    trie->dataLength=latin1Length;
    952    if(leadUnitValue!=initialValue) {
    953        trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH;
    954    }
    955 
    956    actualLength=trie->indexLength*2;
    957    if(make16BitTrie) {
    958        actualLength+=trie->dataLength*2;
    959    } else {
    960        actualLength+=trie->dataLength*4;
    961    }
    962 
    963    /* enough space for the dummy trie? */
    964    if(length<actualLength) {
    965        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    966        return actualLength;
    967    }
    968 
    969    trie->isLatin1Linear=true;
    970    trie->initialValue=initialValue;
    971 
    972    /* fill the index and data arrays */
    973    p16=(uint16_t *)data;
    974    trie->index=p16;
    975 
    976    if(make16BitTrie) {
    977        /* indexes to block 0 */
    978        block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT);
    979        limit=trie->indexLength;
    980        for(i=0; i<limit; ++i) {
    981            p16[i]=block;
    982        }
    983 
    984        if(leadUnitValue!=initialValue) {
    985            /* indexes for lead surrogate code units to the block after Latin-1 */
    986            block+=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
    987            i=0xd800>>UTRIE_SHIFT;
    988            limit=0xdc00>>UTRIE_SHIFT;
    989            for(; i<limit; ++i) {
    990                p16[i]=block;
    991            }
    992        }
    993 
    994        trie->data32=nullptr;
    995 
    996        /* Latin-1 data */
    997        p16+=trie->indexLength;
    998        for(i=0; i<latin1Length; ++i) {
    999            p16[i]=(uint16_t)initialValue;
   1000        }
   1001 
   1002        /* data for lead surrogate code units */
   1003        if(leadUnitValue!=initialValue) {
   1004            limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
   1005            for(/* i=latin1Length */; i<limit; ++i) {
   1006                p16[i]=(uint16_t)leadUnitValue;
   1007            }
   1008        }
   1009    } else {
   1010        uint32_t *p32;
   1011 
   1012        /* indexes to block 0 */
   1013        uprv_memset(p16, 0, trie->indexLength*2);
   1014 
   1015        if(leadUnitValue!=initialValue) {
   1016            /* indexes for lead surrogate code units to the block after Latin-1 */
   1017            block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
   1018            i=0xd800>>UTRIE_SHIFT;
   1019            limit=0xdc00>>UTRIE_SHIFT;
   1020            for(; i<limit; ++i) {
   1021                p16[i]=block;
   1022            }
   1023        }
   1024 
   1025        trie->data32=p32=(uint32_t *)(p16+trie->indexLength);
   1026 
   1027        /* Latin-1 data */
   1028        for(i=0; i<latin1Length; ++i) {
   1029            p32[i]=initialValue;
   1030        }
   1031 
   1032        /* data for lead surrogate code units */
   1033        if(leadUnitValue!=initialValue) {
   1034            limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
   1035            for(/* i=latin1Length */; i<limit; ++i) {
   1036                p32[i]=leadUnitValue;
   1037            }
   1038        }
   1039    }
   1040 
   1041    trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
   1042 
   1043    return actualLength;
   1044 }
   1045 
   1046 /* enumeration -------------------------------------------------------------- */
   1047 
   1048 /* default UTrieEnumValue() returns the input value itself */
   1049 static uint32_t U_CALLCONV
   1050 enumSameValue(const void * /*context*/, uint32_t value) {
   1051    return value;
   1052 }
   1053 
   1054 /**
   1055 * Enumerate all ranges of code points with the same relevant values.
   1056 * The values are transformed from the raw trie entries by the enumValue function.
   1057 */
   1058 U_CAPI void U_EXPORT2
   1059 utrie_enum(const UTrie *trie,
   1060           UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) {
   1061    const uint32_t *data32;
   1062    const uint16_t *idx;
   1063 
   1064    uint32_t value, prevValue, initialValue;
   1065    UChar32 c, prev;
   1066    int32_t l, i, j, block, prevBlock, nullBlock, offset;
   1067 
   1068    /* check arguments */
   1069    if(trie==nullptr || trie->index==nullptr || enumRange==nullptr) {
   1070        return;
   1071    }
   1072    if(enumValue==nullptr) {
   1073        enumValue=enumSameValue;
   1074    }
   1075 
   1076    idx=trie->index;
   1077    data32=trie->data32;
   1078 
   1079    /* get the enumeration value that corresponds to an initial-value trie data entry */
   1080    initialValue=enumValue(context, trie->initialValue);
   1081 
   1082    if(data32==nullptr) {
   1083        nullBlock=trie->indexLength;
   1084    } else {
   1085        nullBlock=0;
   1086    }
   1087 
   1088    /* set variables for previous range */
   1089    prevBlock=nullBlock;
   1090    prev=0;
   1091    prevValue=initialValue;
   1092 
   1093    /* enumerate BMP - the main loop enumerates data blocks */
   1094    for(i=0, c=0; c<=0xffff; ++i) {
   1095        if(c==0xd800) {
   1096            /* skip lead surrogate code _units_, go to lead surr. code _points_ */
   1097            i=UTRIE_BMP_INDEX_LENGTH;
   1098        } else if(c==0xdc00) {
   1099            /* go back to regular BMP code points */
   1100            i=c>>UTRIE_SHIFT;
   1101        }
   1102 
   1103        block=idx[i]<<UTRIE_INDEX_SHIFT;
   1104        if(block==prevBlock) {
   1105            /* the block is the same as the previous one, and filled with value */
   1106            c+=UTRIE_DATA_BLOCK_LENGTH;
   1107        } else if(block==nullBlock) {
   1108            /* this is the all-initial-value block */
   1109            if(prevValue!=initialValue) {
   1110                if(prev<c) {
   1111                    if(!enumRange(context, prev, c, prevValue)) {
   1112                        return;
   1113                    }
   1114                }
   1115                prevBlock=nullBlock;
   1116                prev=c;
   1117                prevValue=initialValue;
   1118            }
   1119            c+=UTRIE_DATA_BLOCK_LENGTH;
   1120        } else {
   1121            prevBlock=block;
   1122            for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
   1123                value=enumValue(context, data32!=nullptr ? data32[block+j] : idx[block+j]);
   1124                if(value!=prevValue) {
   1125                    if(prev<c) {
   1126                        if(!enumRange(context, prev, c, prevValue)) {
   1127                            return;
   1128                        }
   1129                    }
   1130                    if(j>0) {
   1131                        /* the block is not filled with all the same value */
   1132                        prevBlock=-1;
   1133                    }
   1134                    prev=c;
   1135                    prevValue=value;
   1136                }
   1137                ++c;
   1138            }
   1139        }
   1140    }
   1141 
   1142    /* enumerate supplementary code points */
   1143    for(l=0xd800; l<0xdc00;) {
   1144        /* lead surrogate access */
   1145        offset=idx[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
   1146        if(offset==nullBlock) {
   1147            /* no entries for a whole block of lead surrogates */
   1148            if(prevValue!=initialValue) {
   1149                if(prev<c) {
   1150                    if(!enumRange(context, prev, c, prevValue)) {
   1151                        return;
   1152                    }
   1153                }
   1154                prevBlock=nullBlock;
   1155                prev=c;
   1156                prevValue=initialValue;
   1157            }
   1158 
   1159            l+=UTRIE_DATA_BLOCK_LENGTH;
   1160            c+=UTRIE_DATA_BLOCK_LENGTH<<10;
   1161            continue;
   1162        }
   1163 
   1164        value= data32!=nullptr ? data32[offset+(l&UTRIE_MASK)] : idx[offset+(l&UTRIE_MASK)];
   1165 
   1166        /* enumerate trail surrogates for this lead surrogate */
   1167        offset=trie->getFoldingOffset(value);
   1168        if(offset<=0) {
   1169            /* no data for this lead surrogate */
   1170            if(prevValue!=initialValue) {
   1171                if(prev<c) {
   1172                    if(!enumRange(context, prev, c, prevValue)) {
   1173                        return;
   1174                    }
   1175                }
   1176                prevBlock=nullBlock;
   1177                prev=c;
   1178                prevValue=initialValue;
   1179            }
   1180 
   1181            /* nothing else to do for the supplementary code points for this lead surrogate */
   1182            c+=0x400;
   1183        } else {
   1184            /* enumerate code points for this lead surrogate */
   1185            i=offset;
   1186            offset+=UTRIE_SURROGATE_BLOCK_COUNT;
   1187            do {
   1188                /* copy of most of the body of the BMP loop */
   1189                block=idx[i]<<UTRIE_INDEX_SHIFT;
   1190                if(block==prevBlock) {
   1191                    /* the block is the same as the previous one, and filled with value */
   1192                    c+=UTRIE_DATA_BLOCK_LENGTH;
   1193                } else if(block==nullBlock) {
   1194                    /* this is the all-initial-value block */
   1195                    if(prevValue!=initialValue) {
   1196                        if(prev<c) {
   1197                            if(!enumRange(context, prev, c, prevValue)) {
   1198                                return;
   1199                            }
   1200                        }
   1201                        prevBlock=nullBlock;
   1202                        prev=c;
   1203                        prevValue=initialValue;
   1204                    }
   1205                    c+=UTRIE_DATA_BLOCK_LENGTH;
   1206                } else {
   1207                    prevBlock=block;
   1208                    for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
   1209                        value=enumValue(context, data32!=nullptr ? data32[block+j] : idx[block+j]);
   1210                        if(value!=prevValue) {
   1211                            if(prev<c) {
   1212                                if(!enumRange(context, prev, c, prevValue)) {
   1213                                    return;
   1214                                }
   1215                            }
   1216                            if(j>0) {
   1217                                /* the block is not filled with all the same value */
   1218                                prevBlock=-1;
   1219                            }
   1220                            prev=c;
   1221                            prevValue=value;
   1222                        }
   1223                        ++c;
   1224                    }
   1225                }
   1226            } while(++i<offset);
   1227        }
   1228 
   1229        ++l;
   1230    }
   1231 
   1232    /* deliver last range */
   1233    enumRange(context, prev, c, prevValue);
   1234 }