tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utrie.h (31117B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2011, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  utrie.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2001nov08
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #ifndef __UTRIE_H__
     20 #define __UTRIE_H__
     21 
     22 #include "unicode/utypes.h"
     23 #include "unicode/utf16.h"
     24 
     25 U_CDECL_BEGIN
     26 
     27 /**
     28 * \file
     29 *
     30 * This is a common implementation of a "folded" trie.
     31 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
     32 * Unicode code points (0..0x10ffff).
     33 *
     34 * This implementation is optimized for getting values while walking forward
     35 * through a UTF-16 string.
     36 * Therefore, the simplest and fastest access macros are the
     37 * _FROM_LEAD() and _FROM_OFFSET_TRAIL() macros.
     38 *
     39 * The _FROM_BMP() macros are a little more complicated; they get values
     40 * even for lead surrogate code _points_, while the _FROM_LEAD() macros
     41 * get special "folded" values for lead surrogate code _units_ if
     42 * there is relevant data associated with them.
     43 * From such a folded value, an offset needs to be extracted to supply
     44 * to the _FROM_OFFSET_TRAIL() macros.
     45 *
     46 * Most of the more complex (and more convenient) functions/macros call a callback function
     47 * to get that offset from the folded value for a lead surrogate unit.
     48 */
     49 
     50 /**
     51 * Trie constants, defining shift widths, index array lengths, etc.
     52 */
     53 enum {
     54    /** Shift size for shifting right the input index. 1..9 */
     55    UTRIE_SHIFT=5,
     56 
     57    /** Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 0x200 */
     58    UTRIE_DATA_BLOCK_LENGTH=1<<UTRIE_SHIFT,
     59 
     60    /** Mask for getting the lower bits from the input index. */
     61    UTRIE_MASK=UTRIE_DATA_BLOCK_LENGTH-1,
     62 
     63    /**
     64     * Lead surrogate code points' index displacement in the index array.
     65     * 0x10000-0xd800=0x2800
     66     */
     67    UTRIE_LEAD_INDEX_DISP=0x2800>>UTRIE_SHIFT,
     68 
     69    /**
     70     * Shift size for shifting left the index array values.
     71     * Increases possible data size with 16-bit index values at the cost
     72     * of compactability.
     73     * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
     74     * 0..UTRIE_SHIFT
     75     */
     76    UTRIE_INDEX_SHIFT=2,
     77 
     78    /** The alignment size of a stage 2 data block. Also the granularity for compaction. */
     79    UTRIE_DATA_GRANULARITY=1<<UTRIE_INDEX_SHIFT,
     80 
     81    /** Number of bits of a trail surrogate that are used in index table lookups. */
     82    UTRIE_SURROGATE_BLOCK_BITS=10-UTRIE_SHIFT,
     83 
     84    /**
     85     * Number of index (stage 1) entries per lead surrogate.
     86     * Same as number of index entries for 1024 trail surrogates,
     87     * ==0x400>>UTRIE_SHIFT
     88     */
     89    UTRIE_SURROGATE_BLOCK_COUNT=(1<<UTRIE_SURROGATE_BLOCK_BITS),
     90 
     91    /** Length of the BMP portion of the index (stage 1) array. */
     92    UTRIE_BMP_INDEX_LENGTH=0x10000>>UTRIE_SHIFT
     93 };
     94 
     95 /**
     96 * Length of the index (stage 1) array before folding.
     97 * Maximum number of Unicode code points (0x110000) shifted right by UTRIE_SHIFT.
     98 */
     99 #define UTRIE_MAX_INDEX_LENGTH (0x110000>>UTRIE_SHIFT)
    100 
    101 /**
    102 * Maximum length of the runtime data (stage 2) array.
    103 * Limited by 16-bit index values that are left-shifted by UTRIE_INDEX_SHIFT.
    104 */
    105 #define UTRIE_MAX_DATA_LENGTH (0x10000<<UTRIE_INDEX_SHIFT)
    106 
    107 /**
    108 * Maximum length of the build-time data (stage 2) array.
    109 * The maximum length is 0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
    110 * (Number of Unicode code points + one all-initial-value block +
    111 *  possible duplicate entries for 1024 lead surrogates.)
    112 */
    113 #define UTRIE_MAX_BUILD_TIME_DATA_LENGTH (0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400)
    114 
    115 /**
    116 * Number of bytes for a dummy trie.
    117 * A dummy trie is an empty runtime trie, used when a real data trie cannot
    118 * be loaded.
    119 * The number of bytes works for Latin-1-linear tries with 32-bit data
    120 * (worst case).
    121 *
    122 * Calculation:
    123 *   BMP index + 1 index block for lead surrogate code points +
    124 *   Latin-1-linear array + 1 data block for lead surrogate code points
    125 *
    126 * Latin-1: if(UTRIE_SHIFT<=8) { 256 } else { included in first data block }
    127 *
    128 * @see utrie_unserializeDummy
    129 */
    130 #define UTRIE_DUMMY_SIZE ((UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT)*2+(UTRIE_SHIFT<=8?256:UTRIE_DATA_BLOCK_LENGTH)*4+UTRIE_DATA_BLOCK_LENGTH*4)
    131 
    132 /**
    133 * Runtime UTrie callback function.
    134 * Extract from a lead surrogate's data the
    135 * index array offset of the indexes for that lead surrogate.
    136 *
    137 * @param data data value for a surrogate from the trie, including the folding offset
    138 * @return offset>=UTRIE_BMP_INDEX_LENGTH, or 0 if there is no data for the lead surrogate
    139 */
    140 typedef int32_t U_CALLCONV
    141 UTrieGetFoldingOffset(uint32_t data);
    142 
    143 /**
    144 * Run-time Trie structure.
    145 *
    146 * Either the data table is 16 bits wide and accessed via the index
    147 * pointer, with each index item increased by indexLength;
    148 * in this case, data32==NULL.
    149 *
    150 * Or the data table is 32 bits wide and accessed via the data32 pointer.
    151 */
    152 struct UTrie {
    153    const uint16_t *index;
    154    const uint32_t *data32; /* NULL if 16b data is used via index */
    155 
    156    /**
    157     * This function is not used in _FROM_LEAD, _FROM_BMP, and _FROM_OFFSET_TRAIL macros.
    158     * If convenience macros like _GET16 or _NEXT32 are used, this function must be set.
    159     *
    160     * utrie_unserialize() sets a default function which simply returns
    161     * the lead surrogate's value itself - which is the inverse of the default
    162     * folding function used by utrie_serialize().
    163     *
    164     * @see UTrieGetFoldingOffset
    165     */
    166    UTrieGetFoldingOffset *getFoldingOffset;
    167 
    168    int32_t indexLength, dataLength;
    169    uint32_t initialValue;
    170    UBool isLatin1Linear;
    171 };
    172 
    173 #ifndef __UTRIE2_H__
    174 typedef struct UTrie UTrie;
    175 #endif
    176 
    177 /** Internal trie getter from an offset (0 if c16 is a BMP/lead units) and a 16-bit unit */
    178 #define _UTRIE_GET_RAW(trie, data, offset, c16) \
    179    (trie)->data[ \
    180        ((int32_t)((trie)->index[(offset)+((c16)>>UTRIE_SHIFT)])<<UTRIE_INDEX_SHIFT)+ \
    181        ((c16)&UTRIE_MASK) \
    182    ]
    183 
    184 /** Internal trie getter from a pair of surrogates */
    185 #define _UTRIE_GET_FROM_PAIR(trie, data, c, c2, result, resultType) UPRV_BLOCK_MACRO_BEGIN { \
    186    int32_t __offset; \
    187 \
    188    /* get data for lead surrogate */ \
    189    (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
    190    __offset=(trie)->getFoldingOffset(result); \
    191 \
    192    /* get the real data from the folded lead/trail units */ \
    193    if(__offset>0) { \
    194        (result)=_UTRIE_GET_RAW((trie), data, __offset, (c2)&0x3ff); \
    195    } else { \
    196        (result)=(resultType)((trie)->initialValue); \
    197    } \
    198 } UPRV_BLOCK_MACRO_END
    199 
    200 /** Internal trie getter from a BMP code point, treating a lead surrogate as a normal code point */
    201 #define _UTRIE_GET_FROM_BMP(trie, data, c16) \
    202    _UTRIE_GET_RAW(trie, data, 0xd800<=(c16) && (c16)<=0xdbff ? UTRIE_LEAD_INDEX_DISP : 0, c16)
    203 
    204 /**
    205 * Internal trie getter from a code point.
    206 * Could be faster(?) but longer with
    207 *   if((c32)<=0xd7ff) { (result)=_UTRIE_GET_RAW(trie, data, 0, c32); }
    208 */
    209 #define _UTRIE_GET(trie, data, c32, result, resultType) UPRV_BLOCK_MACRO_BEGIN { \
    210    if((uint32_t)(c32)<=0xffff) { \
    211        /* BMP code points */ \
    212        (result)=_UTRIE_GET_FROM_BMP(trie, data, c32); \
    213    } else if((uint32_t)(c32)<=0x10ffff) { \
    214        /* supplementary code point */ \
    215        UChar __lead16=U16_LEAD(c32); \
    216        _UTRIE_GET_FROM_PAIR(trie, data, __lead16, c32, result, resultType); \
    217    } else { \
    218        /* out of range */ \
    219        (result)=(resultType)((trie)->initialValue); \
    220    } \
    221 } UPRV_BLOCK_MACRO_END
    222 
    223 /** Internal next-post-increment: get the next code point (c, c2) and its data */
    224 #define _UTRIE_NEXT(trie, data, src, limit, c, c2, result, resultType) UPRV_BLOCK_MACRO_BEGIN { \
    225    (c)=*(src)++; \
    226    if(!U16_IS_LEAD(c)) { \
    227        (c2)=0; \
    228        (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
    229    } else if((src)!=(limit) && U16_IS_TRAIL((c2)=*(src))) { \
    230        ++(src); \
    231        _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \
    232    } else { \
    233        /* unpaired lead surrogate code point */ \
    234        (c2)=0; \
    235        (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
    236    } \
    237 } UPRV_BLOCK_MACRO_END
    238 
    239 /** Internal previous: get the previous code point (c, c2) and its data */
    240 #define _UTRIE_PREVIOUS(trie, data, start, src, c, c2, result, resultType) UPRV_BLOCK_MACRO_BEGIN { \
    241    (c)=*--(src); \
    242    if(!U16_IS_SURROGATE(c)) { \
    243        (c2)=0; \
    244        (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
    245    } else if(!U16_IS_SURROGATE_LEAD(c)) { \
    246        /* trail surrogate */ \
    247        if((start)!=(src) && U16_IS_LEAD((c2)=*((src)-1))) { \
    248            --(src); \
    249            (result)=(c); (c)=(c2); (c2)=(UChar)(result); /* swap c, c2 */ \
    250            _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result), resultType); \
    251        } else { \
    252            /* unpaired trail surrogate code point */ \
    253            (c2)=0; \
    254            (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
    255        } \
    256    } else { \
    257        /* unpaired lead surrogate code point */ \
    258        (c2)=0; \
    259        (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
    260    } \
    261 } UPRV_BLOCK_MACRO_END
    262 
    263 /* Public UTrie API ---------------------------------------------------------*/
    264 
    265 /**
    266 * Get a pointer to the contiguous part of the data array
    267 * for the Latin-1 range (U+0000..U+00ff).
    268 * Must be used only if the Latin-1 range is in fact linear
    269 * (trie->isLatin1Linear).
    270 *
    271 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    272 * @return (const uint16_t *) pointer to values for Latin-1 code points
    273 */
    274 #define UTRIE_GET16_LATIN1(trie) ((trie)->index+(trie)->indexLength+UTRIE_DATA_BLOCK_LENGTH)
    275 
    276 /**
    277 * Get a pointer to the contiguous part of the data array
    278 * for the Latin-1 range (U+0000..U+00ff).
    279 * Must be used only if the Latin-1 range is in fact linear
    280 * (trie->isLatin1Linear).
    281 *
    282 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    283 * @return (const uint32_t *) pointer to values for Latin-1 code points
    284 */
    285 #define UTRIE_GET32_LATIN1(trie) ((trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
    286 
    287 /**
    288 * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
    289 * c16 may be a lead surrogate, which may have a value including a folding offset.
    290 *
    291 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    292 * @param c16 (UChar, in) the input BMP code point
    293 * @return (uint16_t) trie lookup result
    294 */
    295 #define UTRIE_GET16_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, index, 0, c16)
    296 
    297 /**
    298 * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
    299 * c16 may be a lead surrogate, which may have a value including a folding offset.
    300 *
    301 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    302 * @param c16 (UChar, in) the input BMP code point
    303 * @return (uint32_t) trie lookup result
    304 */
    305 #define UTRIE_GET32_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, data32, 0, c16)
    306 
    307 /**
    308 * Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
    309 * Even lead surrogate code points are treated as normal code points,
    310 * with unfolded values that may differ from _FROM_LEAD() macro results for them.
    311 *
    312 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    313 * @param c16 (UChar, in) the input BMP code point
    314 * @return (uint16_t) trie lookup result
    315 */
    316 #define UTRIE_GET16_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, index, c16)
    317 
    318 /**
    319 * Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
    320 * Even lead surrogate code points are treated as normal code points,
    321 * with unfolded values that may differ from _FROM_LEAD() macro results for them.
    322 *
    323 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    324 * @param c16 (UChar, in) the input BMP code point
    325 * @return (uint32_t) trie lookup result
    326 */
    327 #define UTRIE_GET32_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, data32, c16)
    328 
    329 /**
    330 * Get a 16-bit trie value from a code point.
    331 * Even lead surrogate code points are treated as normal code points,
    332 * with unfolded values that may differ from _FROM_LEAD() macro results for them.
    333 *
    334 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    335 * @param c32 (UChar32, in) the input code point
    336 * @param result (uint16_t, out) uint16_t variable for the trie lookup result
    337 */
    338 #define UTRIE_GET16(trie, c32, result) _UTRIE_GET(trie, index, c32, result, uint16_t)
    339 
    340 /**
    341 * Get a 32-bit trie value from a code point.
    342 * Even lead surrogate code points are treated as normal code points,
    343 * with unfolded values that may differ from _FROM_LEAD() macro results for them.
    344 *
    345 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    346 * @param c32 (UChar32, in) the input code point
    347 * @param result (uint32_t, out) uint32_t variable for the trie lookup result
    348 */
    349 #define UTRIE_GET32(trie, c32, result) _UTRIE_GET(trie, data32, c32, result, uint32_t)
    350 
    351 /**
    352 * Get the next code point (c, c2), post-increment src,
    353 * and get a 16-bit value from the trie.
    354 *
    355 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    356 * @param src (const UChar *, in/out) the source text pointer
    357 * @param limit (const UChar *, in) the limit pointer for the text, or NULL
    358 * @param c (UChar, out) variable for the BMP or lead code unit
    359 * @param c2 (UChar, out) variable for 0 or the trail code unit
    360 * @param result (uint16_t, out) uint16_t variable for the trie lookup result
    361 */
    362 #define UTRIE_NEXT16(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, index, src, limit, c, c2, result, uint16_t)
    363 
    364 /**
    365 * Get the next code point (c, c2), post-increment src,
    366 * and get a 32-bit value from the trie.
    367 *
    368 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    369 * @param src (const UChar *, in/out) the source text pointer
    370 * @param limit (const UChar *, in) the limit pointer for the text, or NULL
    371 * @param c (UChar, out) variable for the BMP or lead code unit
    372 * @param c2 (UChar, out) variable for 0 or the trail code unit
    373 * @param result (uint32_t, out) uint32_t variable for the trie lookup result
    374 */
    375 #define UTRIE_NEXT32(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, data32, src, limit, c, c2, result, uint32_t)
    376 
    377 /**
    378 * Get the previous code point (c, c2), pre-decrement src,
    379 * and get a 16-bit value from the trie.
    380 *
    381 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    382 * @param start (const UChar *, in) the start pointer for the text, or NULL
    383 * @param src (const UChar *, in/out) the source text pointer
    384 * @param c (UChar, out) variable for the BMP or lead code unit
    385 * @param c2 (UChar, out) variable for 0 or the trail code unit
    386 * @param result (uint16_t, out) uint16_t variable for the trie lookup result
    387 */
    388 #define UTRIE_PREVIOUS16(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, index, start, src, c, c2, result, uint16_t)
    389 
    390 /**
    391 * Get the previous code point (c, c2), pre-decrement src,
    392 * and get a 32-bit value from the trie.
    393 *
    394 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    395 * @param start (const UChar *, in) the start pointer for the text, or NULL
    396 * @param src (const UChar *, in/out) the source text pointer
    397 * @param c (UChar, out) variable for the BMP or lead code unit
    398 * @param c2 (UChar, out) variable for 0 or the trail code unit
    399 * @param result (uint32_t, out) uint32_t variable for the trie lookup result
    400 */
    401 #define UTRIE_PREVIOUS32(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, data32, start, src, c, c2, result, uint32_t)
    402 
    403 /**
    404 * Get a 16-bit trie value from a pair of surrogates.
    405 *
    406 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    407 * @param c (UChar, in) a lead surrogate
    408 * @param c2 (UChar, in) a trail surrogate
    409 * @param result (uint16_t, out) uint16_t variable for the trie lookup result
    410 */
    411 #define UTRIE_GET16_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, index, c, c2, result, uint16_t)
    412 
    413 /**
    414 * Get a 32-bit trie value from a pair of surrogates.
    415 *
    416 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    417 * @param c (UChar, in) a lead surrogate
    418 * @param c2 (UChar, in) a trail surrogate
    419 * @param result (uint32_t, out) uint32_t variable for the trie lookup result
    420 */
    421 #define UTRIE_GET32_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, data32, c, c2, result, uint32_t)
    422 
    423 /**
    424 * Get a 16-bit trie value from a folding offset (from the value of a lead surrogate)
    425 * and a trail surrogate.
    426 *
    427 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    428 * @param offset (int32_t, in) the folding offset from the value of a lead surrogate
    429 * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
    430 * @return (uint16_t) trie lookup result
    431 */
    432 #define UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, index, offset, (c2)&0x3ff)
    433 
    434 /**
    435 * Get a 32-bit trie value from a folding offset (from the value of a lead surrogate)
    436 * and a trail surrogate.
    437 *
    438 * @param trie (const UTrie *, in) a pointer to the runtime trie structure
    439 * @param offset (int32_t, in) the folding offset from the value of a lead surrogate
    440 * @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
    441 * @return (uint32_t) trie lookup result
    442 */
    443 #define UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, data32, offset, (c2)&0x3ff)
    444 
    445 /* enumeration callback types */
    446 
    447 /**
    448 * Callback from utrie_enum(), extracts a uint32_t value from a
    449 * trie value. This value will be passed on to the UTrieEnumRange function.
    450 *
    451 * @param context an opaque pointer, as passed into utrie_enum()
    452 * @param value a value from the trie
    453 * @return the value that is to be passed on to the UTrieEnumRange function
    454 */
    455 typedef uint32_t U_CALLCONV
    456 UTrieEnumValue(const void *context, uint32_t value);
    457 
    458 /**
    459 * Callback from utrie_enum(), is called for each contiguous range
    460 * of code points with the same value as retrieved from the trie and
    461 * transformed by the UTrieEnumValue function.
    462 *
    463 * The callback function can stop the enumeration by returning false.
    464 *
    465 * @param context an opaque pointer, as passed into utrie_enum()
    466 * @param start the first code point in a contiguous range with value
    467 * @param limit one past the last code point in a contiguous range with value
    468 * @param value the value that is set for all code points in [start..limit[
    469 * @return false to stop the enumeration
    470 */
    471 typedef UBool U_CALLCONV
    472 UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value);
    473 
    474 /**
    475 * Enumerate efficiently all values in a trie.
    476 * For each entry in the trie, the value to be delivered is passed through
    477 * the UTrieEnumValue function.
    478 * The value is unchanged if that function pointer is NULL.
    479 *
    480 * For each contiguous range of code points with a given value,
    481 * the UTrieEnumRange function is called.
    482 *
    483 * @param trie a pointer to the runtime trie structure
    484 * @param enumValue a pointer to a function that may transform the trie entry value,
    485 *                  or NULL if the values from the trie are to be used directly
    486 * @param enumRange a pointer to a function that is called for each contiguous range
    487 *                  of code points with the same value
    488 * @param context an opaque pointer that is passed on to the callback functions
    489 */
    490 U_CAPI void U_EXPORT2
    491 utrie_enum(const UTrie *trie,
    492           UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context);
    493 
    494 /**
    495 * Unserialize a trie from 32-bit-aligned memory.
    496 * Inverse of utrie_serialize().
    497 * Fills the UTrie runtime trie structure with the settings for the trie data.
    498 *
    499 * @param trie a pointer to the runtime trie structure
    500 * @param data a pointer to 32-bit-aligned memory containing trie data
    501 * @param length the number of bytes available at data
    502 * @param pErrorCode an in/out ICU UErrorCode
    503 * @return the number of bytes at data taken up by the trie data
    504 */
    505 U_CAPI int32_t U_EXPORT2
    506 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode);
    507 
    508 /**
    509 * "Unserialize" a dummy trie.
    510 * A dummy trie is an empty runtime trie, used when a real data trie cannot
    511 * be loaded.
    512 *
    513 * The input memory is filled so that the trie always returns the initialValue,
    514 * or the leadUnitValue for lead surrogate code points.
    515 * The Latin-1 part is always set up to be linear.
    516 *
    517 * @param trie a pointer to the runtime trie structure
    518 * @param data a pointer to 32-bit-aligned memory to be filled with the dummy trie data
    519 * @param length the number of bytes available at data (recommended to use UTRIE_DUMMY_SIZE)
    520 * @param initialValue the initial value that is set for all code points
    521 * @param leadUnitValue the value for lead surrogate code _units_ that do not
    522 *                      have associated supplementary data
    523 * @param pErrorCode an in/out ICU UErrorCode
    524 *
    525 * @see UTRIE_DUMMY_SIZE
    526 * @see utrie_open
    527 */
    528 U_CAPI int32_t U_EXPORT2
    529 utrie_unserializeDummy(UTrie *trie,
    530                       void *data, int32_t length,
    531                       uint32_t initialValue, uint32_t leadUnitValue,
    532                       UBool make16BitTrie,
    533                       UErrorCode *pErrorCode);
    534 
    535 /**
    536 * Default implementation for UTrie.getFoldingOffset, set automatically by
    537 * utrie_unserialize().
    538 * Simply returns the lead surrogate's value itself - which is the inverse
    539 * of the default folding function used by utrie_serialize().
    540 * Exported for static const UTrie structures.
    541 *
    542 * @see UTrieGetFoldingOffset
    543 */
    544 U_CAPI int32_t U_EXPORT2
    545 utrie_defaultGetFoldingOffset(uint32_t data);
    546 
    547 /* Building a trie ----------------------------------------------------------*/
    548 
    549 /**
    550 * Build-time trie structure.
    551 * Opaque definition, here only to make fillIn parameters possible
    552 * for utrie_open() and utrie_clone().
    553 */
    554 struct UNewTrie {
    555    /**
    556     * Index values at build-time are 32 bits wide for easier processing.
    557     * Bit 31 is set if the data block is used by multiple index values (from utrie_setRange()).
    558     */
    559    int32_t index[UTRIE_MAX_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT];
    560    uint32_t *data;
    561 
    562    uint32_t leadUnitValue;
    563    int32_t indexLength, dataCapacity, dataLength;
    564    UBool isAllocated, isDataAllocated;
    565    UBool isLatin1Linear, isCompacted;
    566 
    567    /**
    568     * Map of adjusted indexes, used in utrie_compact().
    569     * Maps from original indexes to new ones.
    570     */
    571    int32_t map[UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT];
    572 };
    573 
    574 typedef struct UNewTrie UNewTrie;
    575 
    576 /**
    577 * Build-time trie callback function, used with utrie_serialize().
    578 * This function calculates a lead surrogate's value including a folding offset
    579 * from the 1024 supplementary code points [start..start+1024[ .
    580 * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
    581 *
    582 * The folding offset is provided by the caller.
    583 * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
    584 * Instead of the offset itself, n can be stored in 10 bits -
    585 * or fewer if it can be assumed that few lead surrogates have associated data.
    586 *
    587 * The returned value must be
    588 * - not zero if and only if there is relevant data
    589 *   for the corresponding 1024 supplementary code points
    590 * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., offset))==offset
    591 *
    592 * @return a folded value, or 0 if there is no relevant data for the lead surrogate.
    593 */
    594 typedef uint32_t U_CALLCONV
    595 UNewTrieGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset);
    596 
    597 /**
    598 * Open a build-time trie structure.
    599 * The size of the build-time data array is specified to avoid allocating a large
    600 * array in all cases. The array itself can also be passed in.
    601 *
    602 * Although the trie is never fully expanded to a linear array, especially when
    603 * utrie_setRange32() is used, the data array could be large during build time.
    604 * The maximum length is
    605 * UTRIE_MAX_BUILD_TIME_DATA_LENGTH=0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
    606 * (Number of Unicode code points + one all-initial-value block +
    607 *  possible duplicate entries for 1024 lead surrogates.)
    608 * (UTRIE_DATA_BLOCK_LENGTH<=0x200 in all cases.)
    609 *
    610 * @param fillIn a pointer to a UNewTrie structure to be initialized (will not be released), or
    611 *               NULL if one is to be allocated
    612 * @param aliasData a pointer to a data array to be used (will not be released), or
    613 *                  NULL if one is to be allocated
    614 * @param maxDataLength the capacity of aliasData (if not NULL) or
    615 *                      the length of the data array to be allocated
    616 * @param initialValue the initial value that is set for all code points
    617 * @param leadUnitValue the value for lead surrogate code _units_ that do not
    618 *                      have associated supplementary data
    619 * @param latin1Linear a flag indicating whether the Latin-1 range is to be allocated and
    620 *                     kept in a linear, contiguous part of the data array
    621 * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
    622 */
    623 U_CAPI UNewTrie * U_EXPORT2
    624 utrie_open(UNewTrie *fillIn,
    625           uint32_t *aliasData, int32_t maxDataLength,
    626           uint32_t initialValue, uint32_t leadUnitValue,
    627           UBool latin1Linear);
    628 
    629 /**
    630 * Clone a build-time trie structure with all entries.
    631 *
    632 * @param fillIn like in utrie_open()
    633 * @param other the build-time trie structure to clone
    634 * @param aliasData like in utrie_open(),
    635 *                  used if aliasDataLength>=(capacity of other's data array)
    636 * @param aliasDataLength the length of aliasData
    637 * @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
    638 */
    639 U_CAPI UNewTrie * U_EXPORT2
    640 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataLength);
    641 
    642 /**
    643 * Close a build-time trie structure, and release memory
    644 * that was allocated by utrie_open() or utrie_clone().
    645 *
    646 * @param trie the build-time trie
    647 */
    648 U_CAPI void U_EXPORT2
    649 utrie_close(UNewTrie *trie);
    650 
    651 /**
    652 * Get the data array of a build-time trie.
    653 * The data may be modified, but entries that are equal before
    654 * must still be equal after modification.
    655 *
    656 * @param trie the build-time trie
    657 * @param pLength (out) a pointer to a variable that receives the number
    658 *                of entries in the data array
    659 * @return the data array
    660 */
    661 U_CAPI uint32_t * U_EXPORT2
    662 utrie_getData(UNewTrie *trie, int32_t *pLength);
    663 
    664 /**
    665 * Set a value for a code point.
    666 *
    667 * @param trie the build-time trie
    668 * @param c the code point
    669 * @param value the value
    670 * @return false if a failure occurred (illegal argument or data array overrun)
    671 */
    672 U_CAPI UBool U_EXPORT2
    673 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value);
    674 
    675 /**
    676 * Get a value from a code point as stored in the build-time trie.
    677 *
    678 * @param trie the build-time trie
    679 * @param c the code point
    680 * @param pInBlockZero if not NULL, then *pInBlockZero is set to true
    681 *                     iff the value is retrieved from block 0;
    682 *                     block 0 is the all-initial-value initial block
    683 * @return the value
    684 */
    685 U_CAPI uint32_t U_EXPORT2
    686 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero);
    687 
    688 /**
    689 * Set a value in a range of code points [start..limit[.
    690 * All code points c with start<=c<limit will get the value if
    691 * overwrite is true or if the old value is 0.
    692 *
    693 * @param trie the build-time trie
    694 * @param start the first code point to get the value
    695 * @param limit one past the last code point to get the value
    696 * @param value the value
    697 * @param overwrite flag for whether old non-initial values are to be overwritten
    698 * @return false if a failure occurred (illegal argument or data array overrun)
    699 */
    700 U_CAPI UBool U_EXPORT2
    701 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite);
    702 
    703 /**
    704 * Compact the build-time trie after all values are set, and then
    705 * serialize it into 32-bit aligned memory.
    706 *
    707 * After this, the trie can only be serizalized again and/or closed;
    708 * no further values can be added.
    709 *
    710 * @see utrie_unserialize()
    711 *
    712 * @param trie the build-time trie
    713 * @param data a pointer to 32-bit-aligned memory for the trie data
    714 * @param capacity the number of bytes available at data
    715 * @param getFoldedValue a callback function that calculates the value for
    716 *                       a lead surrogate from all of its supplementary code points
    717 *                       and the folding offset;
    718 *                       if NULL, then a default function is used which returns just
    719 *                       the input offset when there are any non-initial-value entries
    720 * @param reduceTo16Bits flag for whether the values are to be reduced to a
    721 *                       width of 16 bits for serialization and runtime
    722 * @param pErrorCode a UErrorCode argument; among other possible error codes:
    723 * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
    724 * - U_MEMORY_ALLOCATION_ERROR if the trie data array is too small
    725 * - U_INDEX_OUTOFBOUNDS_ERROR if the index or data arrays are too long after compaction for serialization
    726 *
    727 * @return the number of bytes written for the trie
    728 */
    729 U_CAPI int32_t U_EXPORT2
    730 utrie_serialize(UNewTrie *trie, void *data, int32_t capacity,
    731                UNewTrieGetFoldedValue *getFoldedValue,
    732                UBool reduceTo16Bits,
    733                UErrorCode *pErrorCode);
    734 
    735 /* serialization ------------------------------------------------------------ */
    736 
    737 // UTrie signature values, in platform endianness and opposite endianness.
    738 // The UTrie signature ASCII byte values spell "Trie".
    739 #define UTRIE_SIG       0x54726965
    740 #define UTRIE_OE_SIG    0x65697254
    741 
    742 /**
    743 * Trie data structure in serialized form:
    744 *
    745 * UTrieHeader header;
    746 * uint16_t index[header.indexLength];
    747 * uint16_t data[header.dataLength];
    748 * @internal
    749 */
    750 typedef struct UTrieHeader {
    751    /** "Trie" in big-endian US-ASCII (0x54726965) */
    752    uint32_t signature;
    753 
    754    /**
    755     * options bit field:
    756     *     9    1=Latin-1 data is stored linearly at data+UTRIE_DATA_BLOCK_LENGTH
    757     *     8    0=16-bit data, 1=32-bit data
    758     *  7..4    UTRIE_INDEX_SHIFT   // 0..UTRIE_SHIFT
    759     *  3..0    UTRIE_SHIFT         // 1..9
    760     */
    761    uint32_t options;
    762 
    763    /** indexLength is a multiple of UTRIE_SURROGATE_BLOCK_COUNT */
    764    int32_t indexLength;
    765 
    766    /** dataLength>=UTRIE_DATA_BLOCK_LENGTH */
    767    int32_t dataLength;
    768 } UTrieHeader;
    769 
    770 /**
    771 * Constants for use with UTrieHeader.options.
    772 * @internal
    773 */
    774 enum {
    775    /** Mask to get the UTRIE_SHIFT value from options. */
    776    UTRIE_OPTIONS_SHIFT_MASK=0xf,
    777 
    778    /** Shift options right this much to get the UTRIE_INDEX_SHIFT value. */
    779    UTRIE_OPTIONS_INDEX_SHIFT=4,
    780 
    781    /** If set, then the data (stage 2) array is 32 bits wide. */
    782    UTRIE_OPTIONS_DATA_IS_32_BIT=0x100,
    783 
    784    /**
    785     * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data (stage 2) array
    786     * as a simple, linear array at data+UTRIE_DATA_BLOCK_LENGTH.
    787     */
    788    UTRIE_OPTIONS_LATIN1_IS_LINEAR=0x200
    789 };
    790 
    791 U_CDECL_END
    792 
    793 #endif