tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationdata.h (9278B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2010-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationdata.h
      9 *
     10 * created on: 2010oct27
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifndef __COLLATIONDATA_H__
     15 #define __COLLATIONDATA_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/ucol.h"
     22 #include "unicode/uniset.h"
     23 #include "collation.h"
     24 #include "normalizer2impl.h"
     25 #include "utrie2.h"
     26 
     27 struct UDataMemory;
     28 
     29 U_NAMESPACE_BEGIN
     30 
     31 class UVector32;
     32 
     33 /**
     34 * Collation data container.
     35 * Immutable data created by a CollationDataBuilder, or loaded from a file,
     36 * or deserialized from API-provided binary data.
     37 *
     38 * Includes data for the collation base (root/default), aliased if this is not the base.
        *
        * The struct declares no destructor, so CollationData itself never releases
        * the arrays and objects that its pointer and reference members refer to.
     39 */
     40 struct U_I18N_API CollationData : public UMemory {
     41    // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
     42    // parallel with the ranges, and resetting ranges that are indexed.
     43    // The reordering builder code could clone the resulting template array.
     44    static constexpr int32_t REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14;
     45    static constexpr int32_t REORDER_RESERVED_AFTER_LATIN = REORDER_RESERVED_BEFORE_LATIN + 1;
     46 
     47    static constexpr int32_t MAX_NUM_SPECIAL_REORDER_CODES = 8;
     48    /** C++ only: the data reader checks scriptStartsLength. */
     49    static constexpr int32_t MAX_NUM_SCRIPT_RANGES = 256;
     50 
           /**
            * Creates an empty container: every pointer member is nullptr, every
            * length/count member is 0, and numericPrimary is 0x12000000.
            * @param nfc stored by reference in nfcImpl (not copied), so the referenced
            *            object has to remain valid for as long as this object is used.
            */
     51    CollationData(const Normalizer2Impl &nfc)
     52            : trie(nullptr),
     53              ce32s(nullptr), ces(nullptr), contexts(nullptr), base(nullptr),
     54              jamoCE32s(nullptr),
     55              nfcImpl(nfc),
     56              numericPrimary(0x12000000),
     57              ce32sLength(0), cesLength(0), contextsLength(0),
     58              compressibleBytes(nullptr),
     59              unsafeBackwardSet(nullptr),
     60              fastLatinTable(nullptr), fastLatinTableLength(0),
     61              numScripts(0), scriptsIndex(nullptr), scriptStarts(nullptr), scriptStartsLength(0),
     62              rootElements(nullptr), rootElementsLength(0) {}
     63 
           /**
            * Returns the CE32 that the main lookup trie stores for code point c.
            * Dereferences trie without a null check.
            */
     64    uint32_t getCE32(UChar32 c) const {
     65        return UTRIE2_GET32(trie, c);
     66    }
     67 
           /**
            * Same lookup as getCE32() but through UTRIE2_GET32_FROM_SUPP.
            * NOTE(review): presumably c must be a supplementary code point
            * (U+10000..U+10FFFF) -- confirm against utrie2.h.
            */
     68    uint32_t getCE32FromSupplementary(UChar32 c) const {
     69        return UTRIE2_GET32_FROM_SUPP(trie, c);
     70    }
     71 
           /**
            * Tests whether c is a digit.
            * Below 0x660 only 0x30..0x39 qualify and no trie lookup is made;
            * from 0x660 upwards the answer is whether c's CE32 carries the DIGIT_TAG.
            * NOTE(review): 0x660 is presumably the first non-ASCII decimal digit
            * (U+0660 ARABIC-INDIC DIGIT ZERO) -- confirm.
            */
     72    UBool isDigit(UChar32 c) const {
     73        return c < 0x660 ? c <= 0x39 && 0x30 <= c :
     74                Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);
     75    }
     76 
           /**
            * Returns true if c is in unsafeBackwardSet, or if numeric is true and
            * isDigit(c) holds. Dereferences unsafeBackwardSet without a null check.
            */
     77    UBool isUnsafeBackward(UChar32 c, UBool numeric) const {
     78        return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));
     79    }
     80 
           /**
            * Returns the compressibleBytes flag for lead byte b.
            * There is no range check here; compressibleBytes is documented as
            * holding 256 flags, so b has to be below 256.
            */
     81    UBool isCompressibleLeadByte(uint32_t b) const {
     82        return compressibleBytes[b];
     83    }
     84 
           /**
            * Returns the compressibleBytes flag for the lead (most significant)
            * byte of the 32-bit primary weight p.
            */
     85    UBool isCompressiblePrimary(uint32_t p) const {
     86        return isCompressibleLeadByte(p >> 24);
     87    }
     88 
     89    /**
     90     * Returns the CE32 from two contexts words:
            * p[0] supplies the upper 16 bits and p[1] the lower 16 bits.
     91     * Access to the defaultCE32 for contraction and prefix matching.
     92     */
     93    static uint32_t readCE32(const char16_t *p) {
     94        return (static_cast<uint32_t>(p[0]) << 16) | p[1];
     95    }
     96 
     97    /**
     98     * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
     99     * Requires that ce32 is special.
    100     */
    101    uint32_t getIndirectCE32(uint32_t ce32) const;
    102    /**
    103     * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
    104     * if ce32 is special.
    105     */
    106    uint32_t getFinalCE32(uint32_t ce32) const;
    107 
    108    /**
    109     * Computes a CE from c's ce32 which has the OFFSET_TAG.
            * Reads ces[] at the index encoded in ce32, without a bounds check.
    110     */
    111    int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
    112        int64_t dataCE = ces[Collation::indexFromCE32(ce32)];
    113        return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));
    114    }
    115 
    116    /**
    117     * Returns the single CE that c maps to.
    118     * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
    119     */
    120    int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
    121 
    122    /**
    123     * Returns the FCD16 value for code point c. c must be >= 0.
            * Forwards to nfcImpl.getFCD16().
    124     */
    125    uint16_t getFCD16(UChar32 c) const {
    126        return nfcImpl.getFCD16(c);
    127    }
    128 
    129    /**
    130     * Returns the first primary for the script's reordering group.
    131     * @return the primary with only the first primary lead byte of the group
    132     *         (not necessarily an actual root collator primary weight),
    133     *         or 0 if the script is unknown
    134     */
    135    uint32_t getFirstPrimaryForGroup(int32_t script) const;
    136 
    137    /**
    138     * Returns the last primary for the script's reordering group.
    139     * @return the last primary of the group
    140     *         (not an actual root collator primary weight),
    141     *         or 0 if the script is unknown
    142     */
    143    uint32_t getLastPrimaryForGroup(int32_t script) const;
    144 
    145    /**
    146     * Finds the reordering group which contains the primary weight.
    147     * @return the first script of the group, or -1 if the weight is beyond the last group
    148     */
    149    int32_t getGroupForPrimary(uint32_t p) const;
    150 
           /**
            * Declared here, defined out of line.
            * NOTE(review): presumably writes the scripts that share script's
            * reordering range into dest (at most capacity entries) and returns
            * how many there are -- confirm against the implementation.
            */
    151    int32_t getEquivalentScripts(int32_t script,
    152                                 int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
    153 
    154    /**
    155     * Writes the permutation of primary-weight ranges
    156     * for the given reordering of scripts and groups.
    157     * The caller checks for illegal arguments and
    158     * takes care of [DEFAULT] and memory allocation.
    159     *
    160     * Each list element will be a (limit, offset) pair as described
    161     * for the CollationSettings::reorderRanges.
    162     * The list will be empty if no ranges are reordered.
    163     */
    164    void makeReorderRanges(const int32_t *reorder, int32_t length,
    165                           UVector32 &ranges, UErrorCode &errorCode) const;
    166 
    167    /** @see jamoCE32s */
           // constexpr, like the other in-class constants above; a constexpr static
           // data member is implicitly inline since C++17, so ODR-using it does not
           // require a separate out-of-class definition.
    168    static constexpr int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
    169 
    170    /** Main lookup trie. */
    171    const UTrie2 *trie;
    172    /**
    173     * Array of CE32 values.
    174     * At index 0 there must be CE32(U+0000)
    175     * to support U+0000's special-tag for NUL-termination handling.
    176     */
    177    const uint32_t *ce32s;
    178    /** Array of CE values for expansions and OFFSET_TAG. */
    179    const int64_t *ces;
    180    /** Array of prefix and contraction-suffix matching data. */
    181    const char16_t *contexts;
    182    /** Base collation data, or nullptr if this data itself is a base. */
    183    const CollationData *base;
    184    /**
    185     * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
    186     * They are normally simple CE32s, rarely expansions.
    187     * For fast handling of HANGUL_TAG.
    188     */
    189    const uint32_t *jamoCE32s;
           /**
            * The normalizer implementation passed to the constructor; used by getFCD16().
            * Being a reference member, it cannot be re-bound after construction.
            */
    190    const Normalizer2Impl &nfcImpl;
    191    /** The single-byte primary weight (xx000000) for numeric collation. */
    192    uint32_t numericPrimary;
    193 
           /** Lengths that accompany the ce32s, ces and contexts arrays. */
    194    int32_t ce32sLength;
    195    int32_t cesLength;
    196    int32_t contextsLength;
    197 
    198    /** 256 flags for which primary-weight lead bytes are compressible. */
    199    const UBool *compressibleBytes;
    200    /**
    201     * Set of code points that are unsafe for starting string comparison after an identical prefix,
    202     * or in backwards CE iteration.
    203     */
    204    const UnicodeSet *unsafeBackwardSet;
    205 
    206    /**
    207     * Fast Latin table for common-Latin-text string comparisons.
    208     * Data structure see class CollationFastLatin.
    209     */
    210    const uint16_t *fastLatinTable;
    211    int32_t fastLatinTableLength;
    212 
    213    /**
    214     * Data for scripts and reordering groups.
    215     * Uses include building a reordering permutation table and
    216     * providing script boundaries to AlphabeticIndex.
    217     */
    218    int32_t numScripts;
    219    /**
    220     * The length of scriptsIndex is numScripts+16.
    221     * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
    222     * 16 special reorder codes (not all used) are mapped starting at numScripts.
    223     * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
    224     * There are special codes at the end for reorder-reserved primary ranges.
    225     *
    226     * Multiple scripts may share a range and index, for example Hira & Kana.
    227     */
    228    const uint16_t *scriptsIndex;
    229    /**
    230     * Start primary weight (top 16 bits only) for a group/script/reserved range
    231     * indexed by scriptsIndex.
    232     * The first range (separators & terminators) and the last range (trailing weights)
    233     * are not reorderable, and no scriptsIndex entry points to them.
    234     */
    235    const uint16_t *scriptStarts;
    236    int32_t scriptStartsLength;
    237 
    238    /**
    239     * Collation elements in the root collator.
    240     * Used by the CollationRootElements class. The data structure is described there.
    241     * nullptr in a tailoring.
    242     */
    243    const uint32_t *rootElements;
    244    int32_t rootElementsLength;
    245 
    246 private:
           // Helpers declared here and defined out of line.
           // NOTE(review): getScriptIndex presumably maps a script or reorder code to
           // its scriptsIndex entry, and the remaining three presumably support the
           // public makeReorderRanges() -- confirm against the implementation.
    247    int32_t getScriptIndex(int32_t script) const;
    248    void makeReorderRanges(const int32_t *reorder, int32_t length,
    249                           UBool latinMustMove,
    250                           UVector32 &ranges, UErrorCode &errorCode) const;
    251    int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
    252    int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
    253 };
    254 
    255 U_NAMESPACE_END
    256 
    257 #endif  // !UCONFIG_NO_COLLATION
    258 #endif  // __COLLATIONDATA_H__