tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationsettings.h (10412B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationsettings.h
      9 *
     10 * created on: 2013feb07
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #ifndef __COLLATIONSETTINGS_H__
     15 #define __COLLATIONSETTINGS_H__
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/ucol.h"
     22 #include "collation.h"
     23 #include "sharedobject.h"
     24 #include "umutex.h"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 struct CollationData;  // Forward declaration: this header only takes CollationData by const reference.
     29 
     30 /**
     31 * Collation settings/options/attributes.
     32 * These are the values that can be changed via API.
        *
        * Most on/off and small-range attributes are packed into the single
        * `options` word. Layout, as defined by the constants below:
        *   bit 0        CHECK_FCD
        *   bit 1        NUMERIC
        *   bits 3..2    alternate handling (ALTERNATE_MASK; SHIFTED is the only value defined here)
        *   bits 6..4    maxVariable (MAX_VARIABLE_MASK, MAX_VARIABLE_SHIFT)
        *   bit 7        reserved, 0
        *   bit 8        UPPER_FIRST
        *   bit 9        CASE_FIRST
        *   bit 10       CASE_LEVEL
        *   bit 11       BACKWARD_SECONDARY
        *   bits 15..12  strength (STRENGTH_MASK, STRENGTH_SHIFT)
        *
        * The remaining data members hold the variable-top primary, the script
        * reordering tables, and the CollationFastLatin options/primaries.
     33 */
     34 struct U_I18N_API CollationSettings : public SharedObject {
     35    /**
     36     * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
     37     */
     38    static const int32_t CHECK_FCD = 1;
     39    /**
     40     * Options bit 1: Numeric collation.
     41     * Also known as CODAN = COllate Digits As Numbers.
     42     *
     43     * Treat digit sequences as numbers with CE sequences in numeric order,
     44     * rather than returning a normal CE for each digit.
     45     */
     46    static const int32_t NUMERIC = 2;
     47    /**
     48     * "Shifted" alternate handling, see ALTERNATE_MASK.
            * This is the lower bit (bit 2) of the two-bit alternate-handling field.
     49     */
     50    static const int32_t SHIFTED = 4;
     51    /**
     52     * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
     53     * Reserve values 8 and 0xc for shift-trimmed and blanked.
            * getAlternateHandling() reports UCOL_SHIFTED for any non-zero value in this field.
     54     */
     55    static const int32_t ALTERNATE_MASK = 0xc;
     56    /**
     57     * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
     58     */
     59    static const int32_t MAX_VARIABLE_SHIFT = 4;
     60    /** maxVariable options bit mask before shifting. */
     61    static const int32_t MAX_VARIABLE_MASK = 0x70;
     62    /** Options bit 7: Reserved/unused/0. */
     63    /**
     64     * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
     65     */
     66    static const int32_t UPPER_FIRST = 0x100;
     67    /**
     68     * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
     69     * unless case level is on (when they are *moved* into the separate case level).
     70     * By default, the case bits are removed from the tertiary weight (ignored).
     71     *
     72     * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
     73     * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
     74     */
     75    static const int32_t CASE_FIRST = 0x200;
     76    /**
     77     * Options bit mask for caseFirst and upperFirst, before shifting.
     78     * Same value as caseFirst==upperFirst.
            * (That is, both bits set: 0x300.)
     79     */
     80    static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
     81    /**
     82     * Options bit 10: Insert the case level between the secondary and tertiary levels.
     83     */
     84    static const int32_t CASE_LEVEL = 0x400;
     85    /**
     86     * Options bit 11: Compare secondary weights backwards. ("French secondary")
     87     */
     88    static const int32_t BACKWARD_SECONDARY = 0x800;
     89    /**
     90     * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
     91     * It is the top used bit field in the options. (No need to mask after shifting.)
     92     */
     93    static const int32_t STRENGTH_SHIFT = 12;
     94    /** Strength options bit mask before shifting. */
     95    static const int32_t STRENGTH_MASK = 0xf000;
     96 
     97    /** maxVariable values */
           // The enumerators take the implicit values 0..3; that number is what is
           // stored in the maxVariable bit field of options (see getMaxVariable()).
     98    enum MaxVariable {
     99        MAX_VAR_SPACE,
    100        MAX_VAR_PUNCT,
    101        MAX_VAR_SYMBOL,
    102        MAX_VAR_CURRENCY
    103    };
    104 
           /**
            * Default constructor.
            * Sets options to UCOL_DEFAULT_STRENGTH in the strength field combined with
            * MAX_VAR_PUNCT in the maxVariable field; every other options bit is 0.
            * variableTop and minHighNoReorder are 0, all reordering pointers are nullptr
            * with zero lengths/capacity (so hasReordering() is false), and
            * fastLatinOptions is -1 (negative means disabled).
            * fastLatinPrimaries is not listed in the initializer list.
            */
    105    CollationSettings()
    106            : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) |
    107                      (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)),
    108              variableTop(0),
    109              reorderTable(nullptr),
    110              minHighNoReorder(0),
    111              reorderRanges(nullptr), reorderRangesLength(0),
    112              reorderCodes(nullptr), reorderCodesLength(0), reorderCodesCapacity(0),
    113              fastLatinOptions(-1) {}
    114 
           /**
            * Copy constructor; defined out of line.
            * NOTE(review): how owned vs. aliased reordering memory is copied is not
            * visible in this header -- confirm in the implementation file.
            */
    115    CollationSettings(const CollationSettings &other);
           /**
            * Destructor; defined out of line.
            * NOTE(review): presumably releases reordering memory when
            * reorderCodesCapacity != 0 (see that member) -- confirm.
            */
    116    virtual ~CollationSettings();
    117 
           /**
            * Equality comparison; defined out of line.
            * Which members participate is not visible in this header.
            */
    118    bool operator==(const CollationSettings &other) const;
    119 
           /** Exact negation of operator==. */
    120    inline bool operator!=(const CollationSettings &other) const {
    121        return !operator==(other);
    122    }
    123 
           /**
            * Hash code for this settings object; defined out of line.
            * NOTE(review): expected to be consistent with operator== -- confirm.
            */
    124    int32_t hashCode() const;
    125 
           /**
            * Reordering setup functions; all are defined out of line.
            * NOTE(review): judging by the names and by the reorderCodesCapacity
            * description below, resetReordering() presumably clears any reordering,
            * aliasReordering() presumably points at caller-provided arrays without
            * copying, setReordering() presumably builds reordering data from reorder
            * codes, and copyReorderingFrom() presumably copies it from another
            * settings object -- confirm each against the implementation.
            * Functions taking a UErrorCode report failures through that parameter.
            */
    126    void resetReordering();
    127    void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
    128                         const uint32_t *ranges, int32_t rangesLength,
    129                         const uint8_t *table, UErrorCode &errorCode);
    130    void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength,
    131                       UErrorCode &errorCode);
    132    void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode);
    133 
           /** True if a reordering table is set (reorderTable is not nullptr). */
    134    inline UBool hasReordering() const { return reorderTable != nullptr; }
           /**
            * Defined out of line.
            * NOTE(review): presumably reports whether the 256-entry table contains a
            * "split" lead byte, i.e. a 0 entry at a non-zero index -- confirm.
            */
    135    static UBool reorderTableHasSplitBytes(const uint8_t table[256]);
           /**
            * Maps the primary weight p to its reordered value.
            *
            * reorderTable is dereferenced without a null check, so this must only be
            * called when hasReordering() is true.
            *
            * The lead byte of p (p >> 24) indexes reorderTable. If the entry is non-zero,
            * or if p <= Collation::NO_CE_PRIMARY, the entry replaces the lead byte and
            * the lower 24 bits of p are kept. Otherwise the entry is 0, which marks a
            * split lead byte (see reorderTable), and the result comes from reorderEx(p).
            */
    136    inline uint32_t reorder(uint32_t p) const {
    137        uint8_t b = reorderTable[p >> 24];
    138        if(b != 0 || p <= Collation::NO_CE_PRIMARY) {
    139            return (static_cast<uint32_t>(b) << 24) | (p & 0xffffff);  // swap in the new lead byte
    140        } else {
    141            return reorderEx(p);  // split lead byte: out-of-line slow path
    142        }
    143    }
    144 
           /**
            * Sets the strength bit field; defined out of line.
            * NOTE(review): defaultOptions presumably supplies the value to restore when
            * the attribute is set to its default -- confirm.
            */
    145    void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
    146 
           /**
            * Extracts the strength from an options word by shifting right by
            * STRENGTH_SHIFT. No masking is applied because the strength field is the
            * top used bit field of the options.
            */
    147    static int32_t getStrength(int32_t options) {
    148        return options >> STRENGTH_SHIFT;
    149    }
    150 
           /** Strength stored in this object's options. */
    151    int32_t getStrength() const {
    152        return getStrength(options);
    153    }
    154 
    155    /** Sets the options bit for an on/off attribute. */
    156    void setFlag(int32_t bit, UColAttributeValue value,
    157                 int32_t defaultOptions, UErrorCode &errorCode);
    158 
           /**
            * Returns UCOL_ON if any bit of `bit` is set in options, otherwise UCOL_OFF.
            * Intended for the single-bit option constants such as CHECK_FCD or NUMERIC.
            */
    159    UColAttributeValue getFlag(int32_t bit) const {
    160        return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF;
    161    }
    162 
           /**
            * Sets the caseFirst attribute; defined out of line.
            * NOTE(review): presumably updates the CASE_FIRST and UPPER_FIRST bits -- confirm.
            */
    163    void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode);
    164 
           /**
            * Decodes the CASE_FIRST/UPPER_FIRST bit pair:
            * neither bit set yields UCOL_OFF, CASE_FIRST alone yields UCOL_LOWER_FIRST,
            * and any other combination yields UCOL_UPPER_FIRST.
            */
    165    UColAttributeValue getCaseFirst() const {
    166        int32_t option = options & CASE_FIRST_AND_UPPER_MASK;
    167        return (option == 0) ? UCOL_OFF :
    168                (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST;
    169    }
    170 
           /**
            * Sets the alternate-handling attribute; defined out of line.
            * NOTE(review): presumably updates the ALTERNATE_MASK bits -- confirm.
            */
    171    void setAlternateHandling(UColAttributeValue value,
    172                              int32_t defaultOptions, UErrorCode &errorCode);
    173 
           /**
            * Returns UCOL_NON_IGNORABLE when both alternate-handling bits are 0,
            * otherwise UCOL_SHIFTED.
            */
    174    UColAttributeValue getAlternateHandling() const {
    175        return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED;
    176    }
    177 
           /**
            * Sets the maxVariable attribute; defined out of line.
            * NOTE(review): the accepted range and meaning of `value` are not visible
            * in this header -- confirm.
            */
    178    void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
    179 
           /** Returns the maxVariable bit field of options, converted to the MaxVariable enum. */
    180    MaxVariable getMaxVariable() const {
    181        return static_cast<MaxVariable>((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT);
    182    }
    183 
    184    /**
    185     * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
    186     */
    187    static inline UBool isTertiaryWithCaseBits(int32_t options) {
    188        return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
    189    }
           /**
            * Returns the mask to apply to tertiary weights for the given options:
            * Collation::CASE_AND_TERTIARY_MASK when isTertiaryWithCaseBits(options),
            * otherwise Collation::ONLY_TERTIARY_MASK.
            */
    190    static uint32_t getTertiaryMask(int32_t options) {
    191        // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    192        return isTertiaryWithCaseBits(options) ?
    193                Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK;
    194    }
    195 
           /**
            * True only when CASE_LEVEL is clear and both CASE_FIRST and UPPER_FIRST are set.
            */
    196    static UBool sortsTertiaryUpperCaseFirst(int32_t options) {
    197        // On tertiary level, consider case bits and sort uppercase first
    198        // if caseLevel is off and caseFirst==upperFirst.
    199        return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
    200    }
    201 
           /** True when the CHECK_FCD bit is clear. */
    202    inline UBool dontCheckFCD() const {
    203        return (options & CHECK_FCD) == 0;
    204    }
    205 
           /** True when the BACKWARD_SECONDARY bit is set. */
    206    inline UBool hasBackwardSecondary() const {
    207        return (options & BACKWARD_SECONDARY) != 0;
    208    }
    209 
           /** True when the NUMERIC bit is set. */
    210    inline UBool isNumeric() const {
    211        return (options & NUMERIC) != 0;
    212    }
    213 
    214    /** CHECK_FCD etc. */
    215    int32_t options;
    216    /** Variable-top primary weight. */
    217    uint32_t variableTop;
    218    /**
    219     * 256-byte table for reordering permutation of primary lead bytes; nullptr if no reordering.
    220     * A 0 entry at a non-zero index means that the primary lead byte is "split"
    221     * (there are different offsets for primaries that share that lead byte)
    222     * and the reordering offset must be determined via the reorderRanges.
    223     */
    224    const uint8_t *reorderTable;
    225    /** Limit of last reordered range. 0 if no reordering or no split bytes. */
    226    uint32_t minHighNoReorder;
    227    /**
    228     * Primary-weight ranges for script reordering,
    229     * to be used by reorder(p) for split-reordered primary lead bytes.
    230     *
    231     * Each entry is a (limit, offset) pair.
    232     * The upper 16 bits of the entry are the upper 16 bits of the
    233     * exclusive primary limit of a range.
    234     * Primaries between the previous limit and this one have their lead bytes
    235     * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
    236     *
    237     * CollationData::makeReorderRanges() writes a full list where the first range
    238     * (at least for terminators and separators) has a 0 offset.
    239     * The last range has a non-zero offset.
    240     * minHighNoReorder is set to the limit of that last range.
    241     *
    242     * In the settings object, the initial ranges before the first split lead byte
    243     * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
    244     * If there are no split-reordered lead bytes, then no ranges are needed.
    245     */
    246    const uint32_t *reorderRanges;
           /** Number of entries in reorderRanges; 0 in a default-constructed object. */
    247    int32_t reorderRangesLength;
    248    /** Array of reorder codes; ignored if reorderCodesLength == 0. */
    249    const int32_t *reorderCodes;
    250    /** Number of reorder codes; 0 if no reordering. */
    251    int32_t reorderCodesLength;
    252    /**
    253     * Capacity of reorderCodes.
    254     * If 0, then the codes, the ranges, and the table are aliases.
    255     * Otherwise, this object owns the memory via the reorderCodes pointer;
    256     * the codes, the ranges, and the table are in the same memory block, in that order.
    257     */
    258    int32_t reorderCodesCapacity;
    259 
    260    /** Options for CollationFastLatin. Negative if disabled. */
    261    int32_t fastLatinOptions;
           /**
            * Table of 0x180 16-bit values.
            * Not listed in the default constructor's initializer list.
            * NOTE(review): presumably primary weights consumed by CollationFastLatin
            * and only meaningful when fastLatinOptions is non-negative -- confirm.
            */
    262    uint16_t fastLatinPrimaries[0x180];
    263 
    264 private:
           /**
            * Defined out of line.
            * NOTE(review): presumably stores the given codes, ranges and table in
            * this object (owned memory, see reorderCodesCapacity) -- confirm.
            */
    265    void setReorderArrays(const int32_t *codes, int32_t codesLength,
    266                          const uint32_t *ranges, int32_t rangesLength,
    267                          const uint8_t *table, UErrorCode &errorCode);
           /**
            * Out-of-line slow path of reorder(p), reached when the reorderTable entry
            * for p's lead byte is 0 and p > Collation::NO_CE_PRIMARY.
            * NOTE(review): presumably resolves the offset via reorderRanges -- confirm.
            */
    268    uint32_t reorderEx(uint32_t p) const;
    269 };
    270 
    271 U_NAMESPACE_END
    272 
    273 #endif  // !UCONFIG_NO_COLLATION
    274 #endif  // __COLLATIONSETTINGS_H__