tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationdatabuilder.h (10336B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2012-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationdatabuilder.h
      9 *
     10 * created on: 2012apr01
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifndef __COLLATIONDATABUILDER_H__
     15 #define __COLLATIONDATABUILDER_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/uniset.h"
     22 #include "unicode/unistr.h"
     23 #include "unicode/uversion.h"
     24 #include "collation.h"
     25 #include "collationdata.h"
     26 #include "collationsettings.h"
     27 #include "normalizer2impl.h"
     28 #include "utrie2.h"
     29 #include "uvectr32.h"
     30 #include "uvectr64.h"
     31 #include "uvector.h"
     32 
     33 U_NAMESPACE_BEGIN
     34 
     35 struct ConditionalCE32;  // element type of CollationDataBuilder::conditionalCE32s
     36 
     37 class CollationFastLatinBuilder;  // pointee type of CollationDataBuilder::fastLatinBuilder
     38 class CopyHelper;  // declared a friend of CollationDataBuilder
     39 class DataBuilderCollationIterator;  // friend of CollationDataBuilder; pointee type of collIter
     40 class UCharsTrieBuilder;  // parameter type of CollationDataBuilder::addContextTrie()
     41 
     42 /**
     43 * Low-level CollationData builder.
     44 * Takes (character, CE) pairs and builds them into runtime data structures.
     45 * Supports characters with context prefixes and contraction suffixes.
        *
        * Mappings are entered with add() or addCE32(); build() takes a CollationData
        * by non-const reference (presumably the object to fill -- confirm in the
        * implementation file).
        * isCompressibleLeadByte(), encodeCEs() and build() are virtual, and most
        * data members are protected rather than private.
        * CopyHelper and DataBuilderCollationIterator are friends of this class.
     46 */
     47 class U_I18N_API CollationDataBuilder : public UObject {
     48 public:
     49    /**
     50     * Collation element modifier. Interface class for a modifier
     51     * that changes a tailoring builder's temporary CEs to final CEs.
     52     * Called for every non-special CE32 and every expansion CE.
            *
            * Both modify functions are pure virtual. copyFrom() is the function
            * in this header that accepts a CEModifier.
     53     */
     54    class CEModifier : public UObject {
     55    public:
     56        virtual ~CEModifier();
     57        /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
     58        virtual int64_t modifyCE32(uint32_t ce32) const = 0;
     59        /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
     60        virtual int64_t modifyCE(int64_t ce) const = 0;
     61    };
     62 
           /**
            * @param icu4xMode NOTE(review): presumably stored in the icu4xMode member
            *                  declared below -- confirm in the implementation file
            * @param errorCode ICU in/out error code
            */
     63    CollationDataBuilder(UBool icu4xMode, UErrorCode &errorCode);
     64 
     65    virtual ~CollationDataBuilder();
     66 
           /**
            * @param b NOTE(review): presumably the base data that ends up in the `base`
            *          member -- confirm in the implementation file
            * @param errorCode ICU in/out error code
            */
     67    void initForTailoring(const CollationData *b, UErrorCode &errorCode);
     68 
           /**
            * Virtual predicate on a lead byte value.
            * isCompressiblePrimary() calls it with the top byte of a primary weight.
            * NOTE(review): the criterion itself is defined in the implementation file.
            */
     69    virtual UBool isCompressibleLeadByte(uint32_t b) const;
     70 
           /**
            * @return isCompressibleLeadByte() applied to p >> 24,
            * i.e. to the most significant byte of the 32-bit primary p
            */
     71    inline UBool isCompressiblePrimary(uint32_t p) const {
     72        return isCompressibleLeadByte(p >> 24);
     73    }
     74 
     75    /**
     76     * @return true if this builder has mappings (e.g., add() has been called)
            *
            * Implemented as a plain read of the `modified` member.
     77     */
     78    UBool hasMappings() const { return modified; }
     79 
     80    /**
     81     * @return true if c has CEs in this builder
     82     */
     83    UBool isAssigned(UChar32 c) const;
     84 
     85    /**
     86     * @return the three-byte primary if c maps to a single such CE and has no context data,
     87     * otherwise returns 0.
     88     */
     89    uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;
     90 
     91    /**
     92     * @return the single CE for c.
     93     * Sets an error code if c does not have a single CE.
     94     */
     95    int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
     96 
           /**
            * Adds a mapping from (prefix, s) to the CEs in the ces array.
            * As stated in the encodeCEs() documentation:
            * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
            *
            * @param prefix context prefix string
            * @param s the string being mapped
            * @param ces the CEs for the mapping
            * @param cesLength presumably the number of entries in ces -- confirm
            * @param errorCode ICU in/out error code
            */
     97    void add(const UnicodeString &prefix, const UnicodeString &s,
     98             const int64_t ces[], int32_t cesLength,
     99             UErrorCode &errorCode);
    100 
    101    /**
    102     * Encodes the ces as either the returned ce32 by itself,
    103     * or by storing an expansion, with the returned ce32 referring to that.
    104     *
    105     * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
    106     */
    107    virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
           /**
            * Counterpart of add() that takes an already-encoded ce32
            * (see the equivalence stated for encodeCEs()).
            */
    108    void addCE32(const UnicodeString &prefix, const UnicodeString &s,
    109                 uint32_t ce32, UErrorCode &errorCode);
    110 
    111    /**
    112     * Sets three-byte-primary CEs for a range of code points in code point order,
    113     * if it is worth doing; otherwise no change is made.
    114     * None of the code points in the range should have complex mappings so far
    115     * (expansions/contractions/prefixes).
    116     * @param start first code point
    117     * @param end last code point (inclusive)
    118     * @param primary primary weight for 'start'
    119     * @param step per-code point primary-weight increment
    120     * @param errorCode ICU in/out error code
    121     * @return true if an OFFSET_TAG range was used for start..end
    122     */
    123    UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,
    124                               uint32_t primary, int32_t step,
    125                               UErrorCode &errorCode);
    126 
    127    /**
    128     * Sets three-byte-primary CEs for a range of code points in code point order.
    129     * Sets range values if that is worth doing, or else individual values.
    130     * None of the code points in the range should have complex mappings so far
    131     * (expansions/contractions/prefixes).
    132     * @param start first code point
    133     * @param end last code point (inclusive)
    134     * @param primary primary weight for 'start'
    135     * @param step per-code point primary-weight increment
    136     * @param errorCode ICU in/out error code
    137     * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
    138     */
    139    uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
    140                                          uint32_t primary, int32_t step,
    141                                          UErrorCode &errorCode);
    142 
    143    /**
    144     * Copies all mappings from the src builder, with modifications.
    145     * This builder here must not be built yet, and should be empty.
            *
            * @param src builder to copy from
            * @param modifier CEModifier used for the modifications
            * @param errorCode ICU in/out error code
    146     */
    147    void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
    148                  UErrorCode &errorCode);
    149 
           // NOTE(review): optimize() and suppressContractions() are defined in the
           // implementation file; what they do with `set` is not visible from this header.
    150    void optimize(const UnicodeSet &set, UErrorCode &errorCode);
    151    void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);
    152 
           /** Sets the fastLatinEnabled flag to true; does nothing else here. */
    153    void enableFastLatin() { fastLatinEnabled = true; }
           /**
            * Virtual.
            * NOTE(review): presumably fills `data` from the mappings collected so far --
            * confirm in the implementation file.
            */
    154    virtual void build(CollationData &data, UErrorCode &errorCode);
    155 
    156    /**
    157     * Looks up CEs for s and appends them to the ces array.
    158     * Does not handle normalization: s should be in FCD form.
    159     *
    160     * Does not write completely ignorable CEs.
    161     * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
    162     *
    163     * @return incremented cesLength
    164     */
    165    int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);
           /** Overload of getCEs() that additionally takes a context prefix. */
    166    int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s,
    167                   int64_t ces[], int32_t cesLength);
    168 
    169 protected:
    170    friend class CopyHelper;
    171    friend class DataBuilderCollationIterator;
    172 
    173    uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;
    174 
           // NOTE(review): the int32_t results of the three add*() helpers below look like
           // indexes into ce64s / ce32s / conditionalCE32s -- confirm in the implementation file.
    175    int32_t addCE(int64_t ce, UErrorCode &errorCode);
    176    int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);
    177    int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);
    178 
           /**
            * @return element `index` of conditionalCE32s, cast to ConditionalCE32 *
            * (that vector is documented below as a vector of ConditionalCE32).
            * The index is passed straight to conditionalCE32s[index].
            */
    179    inline ConditionalCE32 *getConditionalCE32(int32_t index) const {
    180        return static_cast<ConditionalCE32 *>(conditionalCE32s[index]);
    181    }
           /** @return getConditionalCE32() for the index given by Collation::indexFromCE32(ce32) */
    182    inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const {
    183        return getConditionalCE32(Collation::indexFromCE32(ce32));
    184    }
    185 
           /**
            * @return the CE32 that Collation::makeCE32FromTagAndIndex() produces
            * for Collation::BUILDER_DATA_TAG and the given index
            */
    186    static uint32_t makeBuilderContextCE32(int32_t index) {
    187        return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);
    188    }
           /** @return the result of Collation::hasCE32Tag() for ce32 and Collation::BUILDER_DATA_TAG */
    189    static inline UBool isBuilderContextCE32(uint32_t ce32) {
    190        return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);
    191    }
    192 
    193    static uint32_t encodeOneCEAsCE32(int64_t ce);
    194    uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);
    195    uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);
    196    uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);
    197 
    198    uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);
    199    /**
    200     * Copies base contractions to a list of ConditionalCE32.
    201     * Sets cond->next to the index of the first new item
    202     * and returns the index of the last new item.
    203     */
    204    int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
    205                                         ConditionalCE32 *cond, UErrorCode &errorCode);
    206 
    207    UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);
    208    void setDigitTags(UErrorCode &errorCode);
    209    void setLeadSurrogates(UErrorCode &errorCode);
    210 
    211    void buildMappings(CollationData &data, UErrorCode &errorCode);
    212 
    213    void clearContexts();
    214    void buildContexts(UErrorCode &errorCode);
    215    uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);
    216    int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
    217                           UErrorCode &errorCode);
    218 
    219    void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);
    220 
           /**
            * Protected overload of getCEs() with an additional `start` parameter.
            * NOTE(review): presumably an offset into s -- confirm in the implementation file.
            */
    221    int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength);
    222 
           /**
            * Maps an index into the concatenated list of L, V and T jamo to a code point.
            * Indexes in [0, JAMO_L_COUNT) yield JAMO_L_BASE + i; the next JAMO_V_COUNT
            * indexes yield JAMO_V_BASE plus their offset; all remaining indexes yield
            * JAMO_T_BASE + 1 plus their offset.
            * No range check is done here; see the expected range in the first body comment.
            */
    223    static UChar32 jamoCpFromIndex(int32_t i) {
    224        // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
    225        if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }
    226        i -= Hangul::JAMO_L_COUNT;
    227        if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }
    228        i -= Hangul::JAMO_V_COUNT;
    229        // i < 27
               // NOTE(review): the "+ 1" presumably skips JAMO_T_BASE itself, which would not
               // be a trailing consonant -- confirm against the Hangul constants.
    230        return Hangul::JAMO_T_BASE + 1 + i;
    231    }
    232 
    233    /** @see Collation::BUILDER_DATA_TAG */
    234    static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;
    235 
    236    const Normalizer2Impl &nfcImpl;
    237    const CollationData *base;
    238    const CollationSettings *baseSettings;
    239    UTrie2 *trie;
    240    UVector32 ce32s;
    241    UVector64 ce64s;
    242    UVector conditionalCE32s;  // vector of ConditionalCE32
    243    // Characters that have context (prefixes or contraction suffixes).
    244    UnicodeSet contextChars;
    245    // Serialized UCharsTrie structures for finalized contexts.
    246    UnicodeString contexts;
    247 private:
    248    /**
    249     * The "era" of building intermediate contexts.
    250     * When the array of cached, temporary contexts overflows, then clearContexts()
    251     * removes them all and invalidates the builtCE32 that used to point to built tries.
    252     * See ConditionalCE32::era.
            *
            * This is the only private data member and the only one with a
            * default member initializer (0) in this header.
    253     */
    254    int32_t contextsEra = 0;
    255 protected:
    256    UnicodeSet unsafeBackwardSet;
    257    /**
    258     * For ICU4X only: The starters that occur in some contraction
    259     * in a position that is neither the first nor the last code point
    260     * of the contraction.
    261     */
    262    UnicodeSet contractionMiddleStarter;
    263    UBool modified;  // value returned by hasMappings()
    264    UBool icu4xMode;
    265 
    266    UBool fastLatinEnabled;  // set to true by enableFastLatin()
    267    CollationFastLatinBuilder *fastLatinBuilder;
    268 
    269    DataBuilderCollationIterator *collIter;
    270 };
    271 
    272 U_NAMESPACE_END
    273 
    274 #endif  // !UCONFIG_NO_COLLATION
    275 #endif  // __COLLATIONDATABUILDER_H__