tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Collator.h (11026B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 #ifndef intl_components_Collator_h_
      5 #define intl_components_Collator_h_
      6 
      7 #ifndef JS_STANDALONE
      8 #  include "gtest/MozGtestFriend.h"
      9 #endif
     10 
     11 #include "unicode/ucol.h"
     12 
     13 #include "mozilla/intl/ICU4CGlue.h"
     14 #include "mozilla/intl/ICUError.h"
     15 #include "mozilla/Result.h"
     16 #include "mozilla/Span.h"
     17 
     18 namespace mozilla::intl {
     19 
     20 class Collator final {  // Wraps and owns an ICU UCollator (see mCollator).
     21 public:
     22  /**
     23   * Construct from a raw UCollator. This is public so that the UniquePtr can
     24   * access it.
     25   */
     26  explicit Collator(UCollator* aCollator);
     27 
     28  // Do not allow copy as this class owns the ICU resource. Move is not
     29  // currently implemented, but a custom move operator could be created if
     30  // needed.
     31  Collator(const Collator&) = delete;
     32  Collator& operator=(const Collator&) = delete;
     33 
     34  /**
     35   * Attempt to initialize a new collator for aLocale, or return an ICUError.
     36   */
     37  static Result<UniquePtr<Collator>, ICUError> TryCreate(const char* aLocale);
     38 
     39  ~Collator();
     40 
     41  /**
     42   * Get a sort key with the provided UTF-16 string, and store the sort key into
     43   * the provided byte buffer.
     44   * Every sort key ends with 0x00, and the terminating 0x00 byte is counted
     45   * when calculating the length of buffer. For the meaning of other byte
     46   * values, check the "Special Byte Values" document from ICU.
     47   *
     48   * https://icu.unicode.org/design/collation/bytes
     49   */
     50  template <typename B>
     51  ICUResult GetSortKey(Span<const char16_t> aString, B& aBuffer) const {
     52    return FillBufferWithICUCall(
     53        aBuffer,
     54        [this, aString](uint8_t* target, int32_t length, UErrorCode* status) {
     55          // ucol_getSortKey doesn't use the error code to report
     56          // U_BUFFER_OVERFLOW_ERROR, instead it uses the return value to
     57          // indicate the desired length to store the key. So we update the
     58          // UErrorCode accordingly to let FillBufferWithICUCall resize the
     59          // buffer.
     60          int32_t len = ucol_getSortKey(mCollator.GetConst(), aString.data(),
     61                                        static_cast<int32_t>(aString.size()),
     62                                        target, length);
     63          if (len == 0) {
     64            // A return value of 0 means there was an internal error.
     65            *status = U_INTERNAL_PROGRAM_ERROR;
     66          } else if (len > length) {
     67            *status = U_BUFFER_OVERFLOW_ERROR;
     68          } else {
     69            *status = U_ZERO_ERROR;
     70          }
     71          return len;
     72        });
     73  }
     74 
     75  int32_t CompareStrings(Span<const char16_t> aSource,
     76                         Span<const char16_t> aTarget) const;
     77 
     78  int32_t CompareSortKeys(Span<const uint8_t> aKey1,
     79                          Span<const uint8_t> aKey2) const;
     80 
     81  /**
     82   * Determine how casing affects sorting. These options map to ECMA 402
     83   * collator options.
     84   *
     85   * https://tc39.es/ecma402/#sec-initializecollator
     86   */
     87  enum class CaseFirst {
     88    // Sort upper case first.
     89    Upper,
     90    // Sort lower case first.
     91    Lower,
     92    // Orders upper and lower case letters in accordance with their tertiary
     93    // weights.
     94    False,
     95  };
     96 
     97  /**
     98   * Which differences in the strings should lead to differences in collation
     99   * comparisons.
    100   *
    101   * This setting needs to be ECMA 402 compliant.
    102   * https://tc39.es/ecma402/#sec-collator-comparestrings
    103   */
    104  enum class Sensitivity {
    105    // Only strings that differ in base letters compare as unequal.
    106    // Examples: a ≠ b, a = á, a = A.
    107    Base,
    108    // Only strings that differ in base letters or accents and other diacritic
    109    // marks compare as unequal.
    110    // Examples: a ≠ b, a ≠ á, a = A.
    111    Accent,
    112    // Only strings that differ in base letters or case compare as unequal.
    113    // Examples: a ≠ b, a = á, a ≠ A.
    114    Case,
    115    // Strings that differ in base letters, accents and other diacritic marks,
    116    // or case compare as unequal. Other differences may also be taken into
    117    // consideration.
    118    // Examples: a ≠ b, a ≠ á, a ≠ A.
    119    Variant,
    120  };
    121 
    122  /**
    123   * These options map to ECMA 402 collator options. Make sure the defaults map
    124   * to the default initialized values of ECMA 402.
    125   *
    126   * https://tc39.es/ecma402/#sec-initializecollator
    127   */
    128  struct Options {
    129    Sensitivity sensitivity = Sensitivity::Variant;
    130    CaseFirst caseFirst = CaseFirst::False;
    131    bool ignorePunctuation = false;
    132    bool numeric = false;
    133  };
    134 
    135  /**
    136   * Change the configuration of the options.
    137   */
    138  ICUResult SetOptions(const Options& aOptions,
    139                       const Maybe<Options&> aPrevOptions = Nothing());
    140 
    141  /**
    142   * Return the case first option of this collator.
    143   */
    144  Result<CaseFirst, ICUError> GetCaseFirst() const;
    145 
    146  /**
    147   * Return the "ignores punctuation" option of this collator.
    148   */
    149  Result<bool, ICUError> GetIgnorePunctuation() const;
    150 
    151  /**
    152   * Map keywords to their BCP 47 equivalents.
    153   */
    154  static SpanResult<char> KeywordValueToBcp47Extension(const char* aKeyword,
    155                                                       int32_t aLength);
    156 
    157  enum class CommonlyUsed : bool {
    158    /**
    159     * Select all possible values, even when not commonly used by a locale.
    160     */
    161    No,
    162 
    163    /**
    164     * Only select the values which are commonly used by a locale.
    165     */
    166    Yes,
    167  };
    168 
    169  using Bcp47ExtEnumeration =
    170      Enumeration<char, SpanResult<char>,
    171                  Collator::KeywordValueToBcp47Extension>;
    172 
    173  /**
    174   * Returns an iterator of collator locale extensions in the preferred order.
    175   * These extensions can be used in BCP 47 locales. For instance this
    176   * iterator could return "phonebk" and could be applied to the German locale
    177   * "de" as "de-co-phonebk" for a phonebook-style collation.
    178   *
    179   * The collation extensions can be found here:
    180   * http://cldr.unicode.org/core-spec/#Key_Type_Definitions
    181   */
    182  static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValuesForLocale(
    183      const char* aLocale, CommonlyUsed aCommonlyUsed = CommonlyUsed::No);
    184 
    185  /**
    186   * Returns an iterator over all possible collator locale extensions.
    187   * These extensions can be used in BCP 47 locales. For instance this
    188   * iterator could return "phonebk" and could be applied to the German locale
    189   * "de" as "de-co-phonebk" for a phonebook-style collation.
    190   *
    191   * The collation extensions can be found here:
    192   * http://cldr.unicode.org/core-spec/#Key_Type_Definitions
    193   */
    194  static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValues();
    195 
    196  /**
    197   * Returns an iterator over all supported collator locales.
    198   *
    199   * The returned strings are ICU locale identifiers and NOT BCP 47 language
    200   * tags.
    201   *
    202   * Also see <https://unicode-org.github.io/icu/userguide/locale>.
    203   */
    204  static auto GetAvailableLocales() {
    205    return AvailableLocalesEnumeration<ucol_countAvailable,
    206                                       ucol_getAvailable>();
    207  }
    208 
    209 private:
    210  /**
    211   * Toggle features, or use the default setting.
    212   */
    213  enum class Feature {
    214    // Turn the feature on.
    215    On,
    216    // Turn the feature off.
    217    Off,
    218    // Use the default setting for the feature.
    219    Default,
    220  };
    221 
    222  static constexpr auto ToUColAttributeValue(Feature aFeature) {
    223    switch (aFeature) {
    224      case Collator::Feature::On:
    225        return UCOL_ON;
    226      case Collator::Feature::Off:
    227        return UCOL_OFF;
    228      case Collator::Feature::Default:
    229        return UCOL_DEFAULT;
    230    }
    231    MOZ_CRASH("invalid collator feature");
    232  }
    233 
    234  /**
    235   * Attribute for handling variable elements.
    236   */
    237  enum class AlternateHandling {
    238    // Treats all the codepoints with non-ignorable primary weights in the
    239    // same way (default)
    240    NonIgnorable,
    241    // Causes codepoints with primary weights that are equal or below the
    242    // variable top value to be ignored on primary level and moved to the
    243    // quaternary level.
    244    Shifted,
    245    Default,  // NOTE(review): presumably "use the default setting" — confirm.
    246  };
    247 
    248  /**
    249   * The strength attribute.
    250   *
    251   * The usual strength for most locales (except Japanese) is tertiary.
    252   *
    253   * Quaternary strength is useful when combined with shifted setting for
    254   * alternate handling attribute and for JIS X 4061 collation, when it is used
    255   * to distinguish between Katakana and Hiragana. Otherwise, quaternary level
    256   * is affected only by the number of non-ignorable code points in the string.
    257   *
    258   * Identical strength is rarely useful, as it amounts to codepoints of the NFD
    259   * form of the string.
    260   */
    261  enum class Strength {
    262    // Primary collation strength.
    263    Primary,
    264    // Secondary collation strength.
    265    Secondary,
    266    // Tertiary collation strength.
    267    Tertiary,
    268    // Quaternary collation strength.
    269    Quaternary,
    270    // Identical collation strength.
    271    Identical,
    272    Default,  // NOTE(review): presumably "use the default setting" — confirm.
    273  };
    274 
    275  /**
    276   * Configure the Collator::Strength.
    277   */
    278  void SetStrength(Strength strength);
    279 
    280  /**
    281   * Configure Collator::AlternateHandling.
    282   */
    283  ICUResult SetAlternateHandling(AlternateHandling aAlternateHandling);
    284 
    285  /**
    286   * Controls whether an extra case level (positioned before the third level) is
    287   * generated or not.
    288   *
    289   * Contents of the case level are affected by the value of CaseFirst
    290   * attribute. A simple way to ignore accent differences in a string is to set
    291   * the strength to Primary and enable case level.
    292   */
    293  ICUResult SetCaseLevel(Feature aFeature);
    294 
    295  /**
    296   * When turned on, this attribute makes substrings of digits sort according to
    297   * their numeric values.
    298   *
    299   * This is a way to get '100' to sort AFTER '2'. Note that the longest digit
    300   * substring that can be treated as a single unit is 254 digits (not counting
    301   * leading zeros). If a digit substring is longer than that, the digits beyond
    302   * the limit will be treated as a separate digit substring.
    303   *
    304   * A "digit" in this sense is a code point with General_Category=Nd, which
    305   * does not include circled numbers, roman numerals, etc. Only a contiguous
    306   * digit substring is considered, that is, non-negative integers without
    307   * separators. There is no support for plus/minus signs, decimals, exponents,
    308   * etc.
    309   */
    310  ICUResult SetNumericCollation(Feature aFeature);
    311 
    312  /**
    313   * Controls whether the normalization check and necessary normalizations are
    314   * performed.
    315   *
    316   * When off (default), no normalization check is performed. The correctness of
    317   * the result is guaranteed only if the input data is in so-called FCD form.
    318   * When set to on, an incremental check is performed to see whether the input
    319   * data is in the FCD form. If the data is not in the FCD form, incremental
    320   * NFD normalization is performed.
    321   */
    322  ICUResult SetNormalizationMode(Feature aFeature);
    323 
    324  /**
    325   * Configure Collator::CaseFirst.
    326   */
    327  ICUResult SetCaseFirst(CaseFirst aCaseFirst);
    328 
    329 #ifndef JS_STANDALONE
    330  FRIEND_TEST(IntlCollator, SetAttributesInternal);
    331 #endif
    332 
    333  ICUPointer<UCollator> mCollator = ICUPointer<UCollator>(nullptr);
    334  Maybe<Sensitivity> mLastStrategy = Nothing();  // NOTE(review): presumably the last applied Sensitivity — confirm.
    335 };
    336 
    337 }  // namespace mozilla::intl
    338 
    339 #endif