tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationdatareader.h (10346B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationdatareader.h
      9 *
     10 * created on: 2013feb07
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifndef __COLLATIONDATAREADER_H__
     15 #define __COLLATIONDATAREADER_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/udata.h"
     22 
     23 struct UDataMemory;
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 struct CollationTailoring;
     28 
     29 /**
     30 * Collation binary data reader: names the indexes[] slots of the serialized data and declares the static entry points.
     31 */
     32 struct U_I18N_API CollationDataReader /* all static */ {
     33    // The following constants are also copied into source/common/ucol_swp.cpp.
     34    // Keep them in sync! (Trailing "// N" notes give each enumerator's implicit value.)
     35    enum {
     36        /**
     37         * Number of int32_t indexes.
     38         *
     39         * Can be 2 if there are only options.
     40         * Can be 7 or 8 if there are only options and a script reordering.
     41         * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
     42         */
     43        IX_INDEXES_LENGTH,  // 0
     44        /**
     45         * Bits 31..24: numericPrimary, for numeric collation
     46         *      23..16: fast Latin format version (0 = no fast Latin table)
     47         *      15.. 0: options bit set
     48         */
     49        IX_OPTIONS,  // 1
     50        IX_RESERVED2,  // 2
     51        IX_RESERVED3,  // 3
     52 
     53        /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
     54        IX_JAMO_CE32S_START,  // 4
     55 
     56        // Byte offsets from the start of the data, after the generic header.
     57        // The indexes[] are at byte offset 0, other data follows.
     58        // Each data item is aligned properly.
     59        // The data items should be in descending order of unit size,
     60        // to minimize the need for padding.
     61        // Each item's byte length is given by the difference between its offset and
     62        // the next index/offset value.
     63        /** Byte offset to int32_t reorderCodes[]. */
     64        IX_REORDER_CODES_OFFSET,  // 5
     65        /**
     66         * Byte offset to uint8_t reorderTable[].
     67         * Empty table if <256 bytes (padding only).
     68         * Otherwise 256 bytes or more (with padding).
     69         */
     70        IX_REORDER_TABLE_OFFSET,  // 6
     71        /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
     72        IX_TRIE_OFFSET,  // 7
     73 
     74        IX_RESERVED8_OFFSET,  // 8
     75        /** Byte offset to int64_t ces[]. */
     76        IX_CES_OFFSET,  // 9
     77        IX_RESERVED10_OFFSET,  // 10
     78        /** Byte offset to uint32_t ce32s[]. */
     79        IX_CE32S_OFFSET,  // 11
     80 
     81        /** Byte offset to uint32_t rootElements[]. */
     82        IX_ROOT_ELEMENTS_OFFSET,  // 12
     83        /** Byte offset to char16_t *contexts[]. */
     84        IX_CONTEXTS_OFFSET,  // 13
     85        /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
     86        IX_UNSAFE_BWD_OFFSET,  // 14
     87        /** Byte offset to uint16_t fastLatinTable[]. */
     88        IX_FAST_LATIN_TABLE_OFFSET,  // 15
     89 
     90        /** Byte offset to uint16_t scripts[]. */
     91        IX_SCRIPTS_OFFSET,  // 16
     92        /**
     93         * Byte offset to UBool compressibleBytes[].
     94         * Empty table if <256 bytes (padding only).
     95         * Otherwise 256 bytes or more (with padding).
     96         */
     97        IX_COMPRESSIBLE_BYTES_OFFSET,  // 17
     98        IX_RESERVED18_OFFSET,  // 18
     99        IX_TOTAL_SIZE  // 19
    100    };
    101 
    102    static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
    103                     CollationTailoring &tailoring, UErrorCode &errorCode);  // declaration only; no definition in this header
    104 
    105    static UBool U_CALLCONV
    106    isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);  // NOTE(review): signature looks like the udata.h UDataMemoryIsAcceptable callback -- confirm
    107 
    108 private:
    109    CollationDataReader() = delete;  // all static: instances are never created
    110 };
    111 
    112 /*
    113 * Format of collation data (ucadata.icu, binary data in coll/ *.res files).
    114 * Format version 5.
    115 *
    116 * The root collation data is stored in the ucadata.icu file.
    117 * Tailorings are stored inside .res resource bundle files, with a complete file header.
    118 *
    119 * Collation data begins with a standard ICU data file header
    120 * (DataHeader, see ucmndata.h and unicode/udata.h).
    121 * The UDataInfo.dataVersion field contains the UCA and other version numbers,
    122 * see the comments for CollationTailoring.version.
    123 *
    124 * After the header, the file contains the following parts.
    125 * Constants are defined as enum values of the CollationDataReader class.
    126 * See also the Collation class.
    127 *
    128 * int32_t indexes[indexesLength];
    129 *      The indexes array has variable length.
    130 *      Some tailorings only need the length and the options,
    131 *      others only add reorderCodes and the reorderTable,
    132 *      some need to store mappings.
    133 *      Only as many indexes are stored as needed to read all of the data.
    134 *
    135 *      Index 0: indexesLength
    136 *      Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
    137 *      Index 2..3: Unused/reserved/0.
    138 *      Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
    139 *               are stored in a short, contiguous part of the ce32s array.
    140 *
    141 *      Indexes 5..19 are byte offsets in ascending order.
    142 *      Each byte offset marks the start of the next part in the data file,
    143 *      and the end of the previous one.
    144 *      When two consecutive byte offsets are the same (or too short),
    145 *      then the corresponding part is empty.
    146 *      Byte offsets are offsets from after the header,
    147 *      that is, from the beginning of the indexes[].
    148 *      Each part starts at an offset with proper alignment for its data.
    149 *      If necessary, the previous part may include padding bytes to achieve this alignment.
    150 *      The last byte offset that is stored in the indexes indicates the total size of the data
    151 *      (starting with the indexes).
    152 *
    153 * int32_t reorderCodes[]; -- empty in root
    154 *      The list of script and reordering codes.
    155 *
    156 *      Beginning with format version 5, this array may optionally
    157 *      have trailing entries with a full list of reorder ranges
    158 *      as described for CollationSettings::reorderRanges.
    159 *
    160 *      Script or reorder codes are first and do not exceed 16-bit values.
    161 *      Range limits are stored in the upper 16 bits, and are never 0.
    162 *      Split this array into reorder codes and ranges at the first entry
    163 *      with non-zero upper 16 bits.
    164 *
    165 *      If the ranges are missing but needed for split-reordered primary lead bytes,
    166 *      then they are regenerated at load time.
    167 *
    168 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
    169 *      Primary-weight lead byte permutation table.
    170 *      Normally present when the reorderCodes are, but can be built at load time.
    171 *
    172 *      Beginning with format version 5, a 0 entry at a non-zero index
    173 *      (which is otherwise an illegal value)
    174 *      means that the primary lead byte is "split"
    175 *      (there are different offsets for primaries that share that lead byte)
    176 *      and the reordering offset must be determined via the reorder ranges
    177 *      that are either stored as part of the reorderCodes array
    178 *      or regenerated at load time.
    179 *
    180 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h
    181 *      The trie holds the main collation data. Each code point is mapped to a 32-bit value.
    182 *      It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
    183 *      in which case it is a special CE32 and contains a 4-bit tag and further data.
    184 *      See the Collation class for details.
    185 *
    186 *      The trie has a value for each lead surrogate code unit with some bits encoding
    187 *      collective properties of the 1024 supplementary characters whose UTF-16 form starts with
    188 *      the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
    189 *
    190 * int64_t ces[];
    191 *      64-bit CEs and expansions that cannot be stored in a more compact form.
    192 *
    193 * uint32_t ce32s[];
    194 *      CE32s for expansions in compact form, and for characters whose trie values
    195 *      contain special data.
    196 *
    197 * uint32_t rootElements[]; -- empty in all tailorings
    198 *      Compact storage for all of the CEs that occur in the root collation.
    199 *      See the CollationRootElements class.
    200 *
    201 * char16_t *contexts[];
    202 *      Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
    203 *
    204 * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
    205 *      Serialized form of characters that are unsafe when iterating backwards,
    206 *      and at the end of an identical string prefix.
    207 *      Back up to a safe character.
    208 *      Lead surrogates are "unsafe" when any of their corresponding supplementary
    209 *      code points are unsafe.
    210 *      Does not include [:^lccc=0:][:^tccc=0:].
    211 *      For each tailoring, the root unsafeBackwardSet is subtracted.
    212 *      (As a result, in many tailorings no set needs to be stored.)
    213 *
    214 * uint16_t fastLatinTable[];
    215 *      Optional optimization for Latin text.
    216 *      See the CollationFastLatin class.
    217 *
    218 * uint16_t scripts[]; -- empty in all tailorings
    219 *      Format version 5:
    220 *      uint16_t numScripts;
    221 *      uint16_t scriptsIndex[numScripts+16];
    222 *      uint16_t scriptStarts[];
    223 *      See CollationData::numScripts etc.
    224 *
    225 *      Format version 4:
    226 *      Table of the reordering groups with their first and last lead bytes,
    227 *      and their script and reordering codes.
    228 *      See CollationData::scripts.
    229 *
    230 * UBool compressibleBytes[]; -- empty in all tailorings
    231 *      Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
    232 *
    233 * -----------------
    234 * Changes for formatVersion 5 (ICU 55)
    235 *
    236 * Reordering moves single scripts, not groups of scripts.
    237 * Reorder ranges are optionally appended to the reorderCodes,
    238 * and a 0 entry in the reorderTable indicates a split lead byte.
    239 * The scripts data has a new format.
    240 *
    241 * The rootElements may contain secondary and tertiary weights below common=05.
    242 * (Used for small Hiragana letters.)
    243 * Where this occurs, there is also an explicit unit with common secondary & tertiary weights.
    244 * There are no other data structure changes, but builder code needs to be able to handle such data.
    245 *
    246 * The collation element for the merge separator code point U+FFFE
    247 * does not necessarily have special, unique secondary/tertiary weights any more.
    248 */
    249 
    250 U_NAMESPACE_END
    251 
    252 #endif  // !UCONFIG_NO_COLLATION
    253 #endif  // __COLLATIONDATAREADER_H__