tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucptrie_impl.h (11336B)


      1 // © 2017 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // ucptrie_impl.h (modified from utrie2_impl.h)
      5 // created: 2017dec29 Markus W. Scherer
      6 
      7 #ifndef __UCPTRIE_IMPL_H__
      8 #define __UCPTRIE_IMPL_H__
      9 
     10 #include "unicode/ucptrie.h"
     11 #ifdef UCPTRIE_DEBUG
     12 #include "unicode/umutablecptrie.h"
     13 #endif
     14 
     15 // UCPTrie signature values, in platform endianness and opposite endianness.
     16 // The UCPTrie signature ASCII byte values spell "Tri3".
     17 #define UCPTRIE_SIG     0x54726933
     18 #define UCPTRIE_OE_SIG  0x33697254
     19 
     20 /**
     21 * Header data for the binary, memory-mappable representation of a UCPTrie/CodePointTrie.
     22 * @internal
     23 */
     24 struct UCPTrieHeader {
     25    /** "Tri3" in big-endian US-ASCII (0x54726933) */
     26    uint32_t signature;
     27 
     28    /**
     29     * Options bit field:
     30     * Bits 15..12: Data length bits 19..16.
     31     * Bits 11..8: Data null block offset bits 19..16.
     32     * Bits 7..6: UCPTrieType
     33     * Bits 5..3: Reserved (0).
     34     * Bits 2..0: UCPTrieValueWidth
     35     */
     36    uint16_t options;
     37 
     38    /** Total length of the index tables. */
     39    uint16_t indexLength;
     40 
     41    /** Data length bits 15..0. */
     42    uint16_t dataLength;
     43 
     44    /** Index-3 null block offset, 0x7fff or 0xffff if none. */
     45    uint16_t index3NullOffset;
     46 
     47    /** Data null block offset bits 15..0, 0xfffff if none. */
     48    uint16_t dataNullOffset;
     49 
     50    /**
     51     * First code point of the single-value range ending with U+10ffff,
     52     * rounded up and then shifted right by UCPTRIE_SHIFT_2.
     53     */
     54    uint16_t shiftedHighStart;
     55 };
     56 
     57 // Constants for use with UCPTrieHeader.options.
     58 constexpr uint16_t UCPTRIE_OPTIONS_DATA_LENGTH_MASK = 0xf000;
     59 constexpr uint16_t UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00;
     60 constexpr uint16_t UCPTRIE_OPTIONS_RESERVED_MASK = 0x38;
     61 constexpr uint16_t UCPTRIE_OPTIONS_VALUE_BITS_MASK = 7;
     62 
     63 /**
     64 * Value for index3NullOffset which indicates that there is no index-3 null block.
     65 * Bit 15 is unused for this value because this bit is used if the index-3 contains
     66 * 18-bit indexes.
     67 */
     68 constexpr int32_t UCPTRIE_NO_INDEX3_NULL_OFFSET = 0x7fff;
     69 constexpr int32_t UCPTRIE_NO_DATA_NULL_OFFSET = 0xfffff;
     70 
     71 // Internal constants.
     72 
     73 /** The length of the BMP index table. 1024=0x400 */
     74 constexpr int32_t UCPTRIE_BMP_INDEX_LENGTH = 0x10000 >> UCPTRIE_FAST_SHIFT;
     75 
     76 constexpr int32_t UCPTRIE_SMALL_LIMIT = 0x1000;
     77 constexpr int32_t UCPTRIE_SMALL_INDEX_LENGTH = UCPTRIE_SMALL_LIMIT >> UCPTRIE_FAST_SHIFT;
     78 
     79 /** Shift size for getting the index-3 table offset. */
     80 constexpr int32_t UCPTRIE_SHIFT_3 = 4;
     81 
     82 /** Shift size for getting the index-2 table offset. */
     83 constexpr int32_t UCPTRIE_SHIFT_2 = 5 + UCPTRIE_SHIFT_3;
     84 
     85 /** Shift size for getting the index-1 table offset. */
     86 constexpr int32_t UCPTRIE_SHIFT_1 = 5 + UCPTRIE_SHIFT_2;
     87 
     88 /**
     89 * Difference between two shift sizes,
     90 * for getting an index-2 offset from an index-3 offset. 5=9-4
     91 */
     92 constexpr int32_t UCPTRIE_SHIFT_2_3 = UCPTRIE_SHIFT_2 - UCPTRIE_SHIFT_3;
     93 
     94 /**
     95 * Difference between two shift sizes,
     96 * for getting an index-1 offset from an index-2 offset. 5=14-9
     97 */
     98 constexpr int32_t UCPTRIE_SHIFT_1_2 = UCPTRIE_SHIFT_1 - UCPTRIE_SHIFT_2;
     99 
    100 /**
    101 * Number of index-1 entries for the BMP. (4)
    102 * This part of the index-1 table is omitted from the serialized form.
    103 */
    104 constexpr int32_t UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> UCPTRIE_SHIFT_1;
    105 
    106 /** Number of entries in an index-2 block. 32=0x20 */
    107 constexpr int32_t UCPTRIE_INDEX_2_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_1_2;
    108 
    109 /** Mask for getting the lower bits for the in-index-2-block offset. */
    110 constexpr int32_t UCPTRIE_INDEX_2_MASK = UCPTRIE_INDEX_2_BLOCK_LENGTH - 1;
    111 
    112 /** Number of code points per index-2 table entry. 512=0x200 */
    113 constexpr int32_t UCPTRIE_CP_PER_INDEX_2_ENTRY = 1 << UCPTRIE_SHIFT_2;
    114 
    115 /** Number of entries in an index-3 block. 32=0x20 */
    116 constexpr int32_t UCPTRIE_INDEX_3_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_2_3;
    117 
    118 /** Mask for getting the lower bits for the in-index-3-block offset. */
    119 constexpr int32_t UCPTRIE_INDEX_3_MASK = UCPTRIE_INDEX_3_BLOCK_LENGTH - 1;
    120 
    121 /** Number of entries in a small data block. 16=0x10 */
    122 constexpr int32_t UCPTRIE_SMALL_DATA_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_3;
    123 
    124 /** Mask for getting the lower bits for the in-small-data-block offset. */
    125 constexpr int32_t UCPTRIE_SMALL_DATA_MASK = UCPTRIE_SMALL_DATA_BLOCK_LENGTH - 1;
    126 
    127 
    128 typedef UChar32
    129 UCPTrieGetRange(const void *trie, UChar32 start,
    130                UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
    131 
    132 U_CFUNC UChar32
    133 ucptrie_internalGetRange(UCPTrieGetRange *getRange,
    134                         const void *trie, UChar32 start,
    135                         UCPMapRangeOption option, uint32_t surrogateValue,
    136                         UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
    137 
    138 #ifdef UCPTRIE_DEBUG
    139 U_CFUNC void
    140 ucptrie_printLengths(const UCPTrie *trie, const char *which);
    141 
    142 U_CFUNC void umutablecptrie_setName(UMutableCPTrie *builder, const char *name);
    143 #endif
    144 
    145 /*
    146 * Format of the binary, memory-mappable representation of a UCPTrie/CodePointTrie.
    147 * For overview information see https://icu.unicode.org/design/struct/utrie
    148 *
    149 * The binary trie data should be 32-bit-aligned.
    150 * The overall layout is:
    151 *
    152 * UCPTrieHeader header; -- 16 bytes, see struct definition above
    153 * uint16_t index[header.indexLength];
    154 * uintXY_t data[header.dataLength];
    155 *
    156 * The trie data array is an array of uint16_t, uint32_t, or uint8_t,
    157 * specified via the UCPTrieValueWidth when building the trie.
    158 * The data array is 32-bit-aligned for uint32_t, otherwise 16-bit-aligned.
    159 * The overall length of the trie data is a multiple of 4 bytes.
    160 * (Padding is added at the end of the index array and/or near the end of the data array as needed.)
    161 *
    162 * The length of the data array (dataLength) is stored as an integer split across two fields
    163 * of the header struct (high bits in header.options).
    164 *
    165 * The trie type can be "fast" or "small" which determines the index structure,
    166 * specified via the UCPTrieType when building the trie.
    167 *
    168 * The type and valueWidth are stored in the header.options.
    169 * There are reserved type and valueWidth values, and reserved header.options bits.
    170 * They could be used in future format extensions.
    171 * Code reading the trie structure must fail with an error when unknown values or options are set.
    172 *
    173 * Values for ASCII character (U+0000..U+007F) can always be found at the start of the data array.
    174 *
    175 * Values for code points below a type-specific fast-indexing limit are found via two-stage lookup.
    176 * For a "fast" trie, the limit is the BMP/supplementary boundary at U+10000.
    177 * For a "small" trie, the limit is UCPTRIE_SMALL_MAX+1=U+1000.
    178 *
    179 * All code points in the range highStart..U+10FFFF map to a single highValue
    180 * which is stored at the second-to-last position of the data array.
    181 * (See UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET.)
    182 * The highStart value is header.shiftedHighStart<<UCPTRIE_SHIFT_2.
    183 * (UCPTRIE_SHIFT_2=9)
    184 *
    185 * Values for code points fast_limit..highStart-1 are found via four-stage lookup.
    186 * The data block size is smaller for this range than for the fast range.
    187 * This together with more index stages with small blocks makes this range
    188 * more easily compactable.
    189 *
    190 * There is also a trie error value stored at the last position of the data array.
    191 * (See UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET.)
    192 * It is intended to be returned for inputs that are not Unicode code points
    193 * (outside U+0000..U+10FFFF), or in string processing for ill-formed input
    194 * (unpaired surrogate in UTF-16, ill-formed UTF-8 subsequence).
    195 *
    196 * For a "fast" trie:
    197 *
    198 * The index array starts with the BMP index table for BMP code point lookup.
    199 * Its length is 1024=0x400.
    200 *
    201 * The supplementary index-1 table follows the BMP index table.
    202 * Variable length, for code points up to highStart-1.
    203 * Maximum length 64=0x40=0x100000>>UCPTRIE_SHIFT_1.
    204 * (For 0x100000 supplementary code points U+10000..U+10ffff.)
    205 *
    206 * After this index-1 table follow the variable-length index-3 and index-2 tables.
    207 *
    208 * The supplementary index tables are omitted completely
    209 * if there is only BMP data (highStart<=U+10000).
    210 *
    211 * For a "small" trie:
    212 *
    213 * The index array starts with a fast-index table for lookup of code points U+0000..U+0FFF.
    214 *
    215 * The "supplementary" index tables are always stored.
    216 * The index-1 table starts from U+0000, its maximum length is 68=0x44=0x110000>>UCPTRIE_SHIFT_1.
    217 *
    218 * For both trie types:
    219 *
    220 * The last index-2 block may be a partial block, storing indexes only for code points
    221 * below highStart.
    222 *
    223 * Lookup for ASCII code point c:
    224 *
    225 * Linear access from the start of the data array.
    226 *
    227 * value = data[c];
    228 *
    229 * Lookup for fast-range code point c:
    230 *
    231 * Shift the code point right by UCPTRIE_FAST_SHIFT=6 bits,
    232 * fetch the index array value at that offset,
    233 * add the lower code point bits, index into the data array.
    234 *
    235 * value = data[index[c>>6] + (c&0x3f)];
    236 *
    237 * (This works for ASCII as well.)
    238 *
    239 * Lookup for small-range code point c below highStart:
    240 *
    241 * Split the code point into four bit fields using several sets of shifts & masks
    242 * to read consecutive values from the index-1, index-2, index-3 and data tables.
    243 *
    244 * If all of the data block offsets in an index-3 block fit within 16 bits (up to 0xffff),
    245 * then the data block offsets are stored directly as uint16_t.
    246 *
    247 * Otherwise (this is very unusual but possible), the index-2 entry for the index-3 block
    248 * has bit 15 (0x8000) set, and each set of 8 index-3 entries is preceded by
    249 * an additional uint16_t word. Data block offsets are 18 bits wide, with the top 2 bits stored
    250 * in the additional word.
    251 *
    252 * See ucptrie_internalSmallIndex() for details.
    253 *
    254 * (In a "small" trie, this works for ASCII and below-fast_limit code points as well.)
    255 *
    256 * Compaction:
    257 *
    258 * Multiple code point ranges ("blocks") that are aligned on certain boundaries
    259 * (determined by the shifting/bit fields of code points) and
    260 * map to the same data values normally share a single subsequence of the data array.
    261 * Data blocks can also overlap partially.
    262 * (Depending on the builder code finding duplicate and overlapping blocks.)
    263 *
    264 * Iteration over same-value ranges:
    265 *
    266 * Range iteration (ucptrie_getRange()) walks the structure from a start code point
    267 * until some code point is found that maps to a different value;
    268 * the end of the returned range is just before that.
    269 *
    270 * The header.dataNullOffset (split across two header fields, high bits in header.options)
    271 * is the offset of a widely shared data block filled with one single value.
    272 * It helps quickly skip over large ranges of data with that value.
    273 * The builder must ensure that if the start of any data block (fast or small)
    274 * matches the dataNullOffset, then the whole block must be filled with the null value.
    275 * Special care must be taken if there is no fast null data block
    276 * but a small one, which is shorter, and it matches the *start* of some fast data block.
    277 *
    278 * Similarly, the header.index3NullOffset is the index-array offset of an index-3 block
    279 * where all index entries point to the dataNullOffset.
    280 * If there is no such data or index-3 block, then these offsets are set to
    281 * values that cannot be reached (data offset out of range/reserved index offset),
    282 * normally UCPTRIE_NO_DATA_NULL_OFFSET or UCPTRIE_NO_INDEX3_NULL_OFFSET respectively.
    283 */
    284 
    285 #endif