Collator.h (11026B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 #ifndef intl_components_Collator_h_ 5 #define intl_components_Collator_h_ 6 7 #ifndef JS_STANDALONE 8 # include "gtest/MozGtestFriend.h" 9 #endif 10 11 #include "unicode/ucol.h" 12 13 #include "mozilla/intl/ICU4CGlue.h" 14 #include "mozilla/intl/ICUError.h" 15 #include "mozilla/Result.h" 16 #include "mozilla/Span.h" 17 18 namespace mozilla::intl { 19 20 class Collator final { 21 public: 22 /** 23 * Construct from a raw UCollator. This is public so that the UniquePtr can 24 * access it. 25 */ 26 explicit Collator(UCollator* aCollator); 27 28 // Do not allow copy as this class owns the ICU resource. Move is not 29 // currently implemented, but a custom move operator could be created if 30 // needed. 31 Collator(const Collator&) = delete; 32 Collator& operator=(const Collator&) = delete; 33 34 /** 35 * Attempt to initialize a new collator. 36 */ 37 static Result<UniquePtr<Collator>, ICUError> TryCreate(const char* aLocale); 38 39 ~Collator(); 40 41 /** 42 * Get a sort key with the provided UTF-16 string, and store the sort key into 43 * the provided buffer of byte array. 44 * Every sort key ends with 0x00, and the terminating 0x00 byte is counted 45 * when calculating the length of buffer. For the purpose of other byte 46 * values, check the "Special Byte Values" document from ICU. 47 * 48 * https://icu.unicode.org/design/collation/bytes 49 */ 50 template <typename B> 51 ICUResult GetSortKey(Span<const char16_t> aString, B& aBuffer) const { 52 return FillBufferWithICUCall( 53 aBuffer, 54 [this, aString](uint8_t* target, int32_t length, UErrorCode* status) { 55 // ucol_getSortKey doesn't use the error code to report 56 // U_BUFFER_OVERFLOW_ERROR, instead it uses the return value to 57 // indicate the desired length to store the key. So we update the 58 // UErrorCode accordingly to let FillBufferWithICUCall resize the 59 // buffer. 60 int32_t len = ucol_getSortKey(mCollator.GetConst(), aString.data(), 61 static_cast<int32_t>(aString.size()), 62 target, length); 63 if (len == 0) { 64 // Returns 0 means there's an internal error. 65 *status = U_INTERNAL_PROGRAM_ERROR; 66 } else if (len > length) { 67 *status = U_BUFFER_OVERFLOW_ERROR; 68 } else { 69 *status = U_ZERO_ERROR; 70 } 71 return len; 72 }); 73 } 74 75 int32_t CompareStrings(Span<const char16_t> aSource, 76 Span<const char16_t> aTarget) const; 77 78 int32_t CompareSortKeys(Span<const uint8_t> aKey1, 79 Span<const uint8_t> aKey2) const; 80 81 /** 82 * Determine how casing affects sorting. These options map to ECMA 402 83 * collator options. 84 * 85 * https://tc39.es/ecma402/#sec-initializecollator 86 */ 87 enum class CaseFirst { 88 // Sort upper case first. 89 Upper, 90 // Sort lower case first. 91 Lower, 92 // Orders upper and lower case letters in accordance to their tertiary 93 // weights. 94 False, 95 }; 96 97 /** 98 * Which differences in the strings should lead to differences in collation 99 * comparisons. 100 * 101 * This setting needs to be ECMA 402 compliant. 102 * https://tc39.es/ecma402/#sec-collator-comparestrings 103 */ 104 enum class Sensitivity { 105 // Only strings that differ in base letters compare as unequal. 106 // Examples: a ≠ b, a = á, a = A. 107 Base, 108 // Only strings that differ in base letters or accents and other diacritic 109 // marks compare as unequal. 110 // Examples: a ≠ b, a ≠ á, a = A. 111 Accent, 112 // Only strings that differ in base letters or case compare as unequal. 113 // Examples: a ≠ b, a = á, a ≠ A. 114 Case, 115 // Strings that differ in base letters, accents and other diacritic marks, 116 // or case compare as unequal. Other differences may also be taken into 117 // consideration. 118 // Examples: a ≠ b, a ≠ á, a ≠ A. 119 Variant, 120 }; 121 122 /** 123 * These options map to ECMA 402 collator options. Make sure the defaults map 124 * to the default initialized values of ECMA 402. 125 * 126 * https://tc39.es/ecma402/#sec-initializecollator 127 */ 128 struct Options { 129 Sensitivity sensitivity = Sensitivity::Variant; 130 CaseFirst caseFirst = CaseFirst::False; 131 bool ignorePunctuation = false; 132 bool numeric = false; 133 }; 134 135 /** 136 * Change the configuraton of the options. 137 */ 138 ICUResult SetOptions(const Options& aOptions, 139 const Maybe<Options&> aPrevOptions = Nothing()); 140 141 /** 142 * Return the case first option of this collator. 143 */ 144 Result<CaseFirst, ICUError> GetCaseFirst() const; 145 146 /** 147 * Return the "ignores punctuation" option of this collator. 148 */ 149 Result<bool, ICUError> GetIgnorePunctuation() const; 150 151 /** 152 * Map keywords to their BCP 47 equivalents. 153 */ 154 static SpanResult<char> KeywordValueToBcp47Extension(const char* aKeyword, 155 int32_t aLength); 156 157 enum class CommonlyUsed : bool { 158 /** 159 * Select all possible values, even when not commonly used by a locale. 160 */ 161 No, 162 163 /** 164 * Only select the values which are commonly used by a locale. 165 */ 166 Yes, 167 }; 168 169 using Bcp47ExtEnumeration = 170 Enumeration<char, SpanResult<char>, 171 Collator::KeywordValueToBcp47Extension>; 172 173 /** 174 * Returns an iterator of collator locale extensions in the preferred order. 175 * These extensions can be used in BCP 47 locales. For instance this 176 * iterator could return "phonebk" and could be appled to the German locale 177 * "de" as "de-co-phonebk" for a phonebook-style collation. 178 * 179 * The collation extensions can be found here: 180 * http://cldr.unicode.org/core-spec/#Key_Type_Definitions 181 */ 182 static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValuesForLocale( 183 const char* aLocale, CommonlyUsed aCommonlyUsed = CommonlyUsed::No); 184 185 /** 186 * Returns an iterator over all possible collator locale extensions. 187 * These extensions can be used in BCP 47 locales. For instance this 188 * iterator could return "phonebk" and could be appled to the German locale 189 * "de" as "de-co-phonebk" for a phonebook-style collation. 190 * 191 * The collation extensions can be found here: 192 * http://cldr.unicode.org/core-spec/#Key_Type_Definitions 193 */ 194 static Result<Bcp47ExtEnumeration, ICUError> GetBcp47KeywordValues(); 195 196 /** 197 * Returns an iterator over all supported collator locales. 198 * 199 * The returned strings are ICU locale identifiers and NOT BCP 47 language 200 * tags. 201 * 202 * Also see <https://unicode-org.github.io/icu/userguide/locale>. 203 */ 204 static auto GetAvailableLocales() { 205 return AvailableLocalesEnumeration<ucol_countAvailable, 206 ucol_getAvailable>(); 207 } 208 209 private: 210 /** 211 * Toggle features, or use the default setting. 212 */ 213 enum class Feature { 214 // Turn the feature off. 215 On, 216 // Turn the feature off. 217 Off, 218 // Use the default setting for the feature. 219 Default, 220 }; 221 222 static constexpr auto ToUColAttributeValue(Feature aFeature) { 223 switch (aFeature) { 224 case Collator::Feature::On: 225 return UCOL_ON; 226 case Collator::Feature::Off: 227 return UCOL_OFF; 228 case Collator::Feature::Default: 229 return UCOL_DEFAULT; 230 } 231 MOZ_CRASH("invalid collator feature"); 232 } 233 234 /** 235 * Attribute for handling variable elements. 236 */ 237 enum class AlternateHandling { 238 // Treats all the codepoints with non-ignorable primary weights in the 239 // same way (default) 240 NonIgnorable, 241 // Causes codepoints with primary weights that are equal or below the 242 // variable top value to be ignored on primary level and moved to the 243 // quaternary level. 244 Shifted, 245 Default, 246 }; 247 248 /** 249 * The strength attribute. 250 * 251 * The usual strength for most locales (except Japanese) is tertiary. 252 * 253 * Quaternary strength is useful when combined with shifted setting for 254 * alternate handling attribute and for JIS X 4061 collation, when it is used 255 * to distinguish between Katakana and Hiragana. Otherwise, quaternary level 256 * is affected only by the number of non-ignorable code points in the string. 257 * 258 * Identical strength is rarely useful, as it amounts to codepoints of the NFD 259 * form of the string. 260 */ 261 enum class Strength { 262 // Primary collation strength. 263 Primary, 264 // Secondary collation strength. 265 Secondary, 266 // Tertiary collation strength. 267 Tertiary, 268 // Quaternary collation strength. 269 Quaternary, 270 // Identical collation strength. 271 Identical, 272 Default, 273 }; 274 275 /** 276 * Configure the Collation::Strength 277 */ 278 void SetStrength(Strength strength); 279 280 /** 281 * Configure Collation::AlternateHandling. 282 */ 283 ICUResult SetAlternateHandling(AlternateHandling aAlternateHandling); 284 285 /** 286 * Controls whether an extra case level (positioned before the third level) is 287 * generated or not. 288 * 289 * Contents of the case level are affected by the value of CaseFirst 290 * attribute. A simple way to ignore accent differences in a string is to set 291 * the strength to Primary and enable case level. 292 */ 293 ICUResult SetCaseLevel(Feature aFeature); 294 295 /** 296 * When turned on, this attribute makes substrings of digits sort according to 297 * their numeric values. 298 * 299 * This is a way to get '100' to sort AFTER '2'. Note that the longest digit 300 * substring that can be treated as a single unit is 254 digits (not counting 301 * leading zeros). If a digit substring is longer than that, the digits beyond 302 * the limit will be treated as a separate digit substring. 303 * 304 * A "digit" in this sense is a code point with General_Category=Nd, which 305 * does not include circled numbers, roman numerals, etc. Only a contiguous 306 * digit substring is considered, that is, non-negative integers without 307 * separators. There is no support for plus/minus signs, decimals, exponents, 308 * etc. 309 */ 310 ICUResult SetNumericCollation(Feature aFeature); 311 312 /** 313 * Controls whether the normalization check and necessary normalizations are 314 * performed. 315 * 316 * When off (default), no normalization check is performed. The correctness of 317 * the result is guaranteed only if the input data is in so-called FCD form 318 * When set to on, an incremental check is performed to see whether the input 319 * data is in the FCD form. If the data is not in the FCD form, incremental 320 * NFD normalization is performed. 321 */ 322 ICUResult SetNormalizationMode(Feature aFeature); 323 324 /** 325 * Configure Collation::CaseFirst. 326 */ 327 ICUResult SetCaseFirst(CaseFirst aCaseFirst); 328 329 #ifndef JS_STANDALONE 330 FRIEND_TEST(IntlCollator, SetAttributesInternal); 331 #endif 332 333 ICUPointer<UCollator> mCollator = ICUPointer<UCollator>(nullptr); 334 Maybe<Sensitivity> mLastStrategy = Nothing(); 335 }; 336 337 } // namespace mozilla::intl 338 339 #endif