SharedIntlData.h (14805B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef builtin_intl_SharedIntlData_h 8 #define builtin_intl_SharedIntlData_h 9 10 #include "mozilla/MemoryReporting.h" 11 #include "mozilla/Span.h" 12 #include "mozilla/UniquePtr.h" 13 14 #include <stddef.h> 15 16 #include "js/AllocPolicy.h" 17 #include "js/GCAPI.h" 18 #include "js/GCHashTable.h" 19 #include "js/Result.h" 20 #include "js/RootingAPI.h" 21 #include "js/Utility.h" 22 #include "vm/StringType.h" 23 24 namespace mozilla::intl { 25 class DateTimePatternGenerator; 26 } // namespace mozilla::intl 27 28 namespace js { 29 30 class ArrayObject; 31 32 namespace intl { 33 34 enum class AvailableLocaleKind { 35 Collator, 36 DateTimeFormat, 37 DisplayNames, 38 DurationFormat, 39 ListFormat, 40 NumberFormat, 41 PluralRules, 42 RelativeTimeFormat, 43 Segmenter, 44 }; 45 46 /** 47 * This deleter class exists so that mozilla::intl::DateTimePatternGenerator 48 * can be a forward declaration, but still be used inside of a UniquePtr. 49 */ 50 class DateTimePatternGeneratorDeleter { 51 public: 52 void operator()(mozilla::intl::DateTimePatternGenerator* ptr); 53 }; 54 55 /** 56 * Stores Intl data which can be shared across compartments (but not contexts). 57 * 58 * Used for data which is expensive when computed repeatedly or is not 59 * available through ICU. 60 */ 61 class SharedIntlData { 62 struct LinearStringLookup { 63 union { 64 const JS::Latin1Char* latin1Chars; 65 const char16_t* twoByteChars; 66 }; 67 bool isLatin1; 68 size_t length; 69 JS::AutoCheckCannotGC nogc; 70 HashNumber hash = 0; 71 72 explicit LinearStringLookup(const JSLinearString* string) 73 : isLatin1(string->hasLatin1Chars()), length(string->length()) { 74 if (isLatin1) { 75 latin1Chars = string->latin1Chars(nogc); 76 } else { 77 twoByteChars = string->twoByteChars(nogc); 78 } 79 } 80 81 LinearStringLookup(const char* chars, size_t length) 82 : isLatin1(true), length(length) { 83 latin1Chars = reinterpret_cast<const JS::Latin1Char*>(chars); 84 } 85 86 LinearStringLookup(const char16_t* chars, size_t length) 87 : isLatin1(false), length(length) { 88 twoByteChars = chars; 89 } 90 }; 91 92 public: 93 /** 94 * Information tracking the set of the supported time zone names, derived 95 * from the IANA time zone database <https://www.iana.org/time-zones>. 96 * 97 * There are two kinds of IANA time zone names: Zone and Link (denoted as 98 * such in database source files). Zone names are the canonical, preferred 99 * name for a time zone, e.g. Asia/Kolkata. Link names simply refer to 100 * target Zone names for their meaning, e.g. Asia/Calcutta targets 101 * Asia/Kolkata. That a name is a Link doesn't *necessarily* reflect a 102 * sense of deprecation: some Link names also exist partly for convenience, 103 * e.g. UTC and GMT as Link names targeting the Zone name Etc/UTC. 104 * 105 * Two data sources determine the time zone names we support: those ICU 106 * supports and IANA's zone information. 107 * 108 * Unfortunately the names ICU and IANA support, and their Link 109 * relationships from name to target, aren't identical, so we can't simply 110 * implicitly trust ICU's name handling. We must perform various 111 * preprocessing of user-provided zone names and post-processing of 112 * ICU-provided zone names to implement ECMA-402's IANA-consistent behavior. 113 * 114 * Also see <https://ssl.icu-project.org/trac/ticket/12044> and 115 * <http://unicode.org/cldr/trac/ticket/9892>. 116 */ 117 118 using TimeZoneName = JSAtom*; 119 120 struct AvailableTimeZoneHasher { 121 struct Lookup : LinearStringLookup { 122 explicit Lookup(const JSLinearString* timeZone); 123 Lookup(const char* chars, size_t length); 124 Lookup(const char16_t* chars, size_t length); 125 }; 126 127 static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; } 128 static bool match(TimeZoneName key, const Lookup& lookup); 129 }; 130 131 struct TimeZoneHasher { 132 using Lookup = TimeZoneName; 133 134 static js::HashNumber hash(const Lookup& lookup) { return lookup->hash(); } 135 static bool match(TimeZoneName key, const Lookup& lookup) { 136 return key == lookup; 137 } 138 }; 139 140 using AvailableTimeZoneSet = 141 GCHashSet<TimeZoneName, AvailableTimeZoneHasher, SystemAllocPolicy>; 142 using TimeZoneSet = 143 GCHashSet<TimeZoneName, TimeZoneHasher, SystemAllocPolicy>; 144 using TimeZoneMap = 145 GCHashMap<TimeZoneName, TimeZoneName, TimeZoneHasher, SystemAllocPolicy>; 146 147 private: 148 /** 149 * As a threshold matter, available time zones are those time zones ICU 150 * supports, via ucal_openTimeZones. But ICU supports additional non-IANA 151 * time zones described in intl/icu/source/tools/tzcode/icuzones (listed in 152 * TimeZoneDataGenerated.h's |legacyICUTimeZones|) for its own backwards 153 * compatibility purposes. This set consists of ICU's supported time zones, 154 * minus all backwards-compatibility time zones. 155 */ 156 AvailableTimeZoneSet availableTimeZones; 157 158 /** 159 * IANA treats some time zone names as Zones, that ICU instead treats as 160 * Links. For example, IANA considers "America/Indiana/Indianapolis" to be 161 * a Zone and "America/Fort_Wayne" a Link that targets it, but ICU 162 * considers the former a Link that targets "America/Indianapolis" (which 163 * IANA treats as a Link). 164 * 165 * ECMA-402 requires that we respect IANA data, so if we're asked to 166 * canonicalize a time zone name in this set, we must *not* return ICU's 167 * canonicalization. 168 */ 169 TimeZoneSet ianaZonesTreatedAsLinksByICU; 170 171 /** 172 * IANA treats some time zone names as Links to one target, that ICU 173 * instead treats as either Zones, or Links to different targets. An 174 * example of the former is "Asia/Calcutta, which IANA assigns the target 175 * "Asia/Kolkata" but ICU considers its own Zone. An example of the latter 176 * is "US/East-Indiana", which IANA assigns the target 177 * "America/Indiana/Indianapolis" but ICU assigns the target 178 * "America/Indianapolis". 179 * 180 * ECMA-402 requires that we respect IANA data, so if we're asked to 181 * canonicalize a time zone name that's a key in this map, we *must* return 182 * the corresponding value and *must not* return ICU's canonicalization. 183 */ 184 TimeZoneMap ianaLinksCanonicalizedDifferentlyByICU; 185 186 bool timeZoneDataInitialized = false; 187 188 /** 189 * Precomputes the available time zone names, because it's too expensive to 190 * call ucal_openTimeZones() repeatedly. 191 */ 192 bool ensureTimeZones(JSContext* cx); 193 194 /** 195 * Returns the canonical time zone name. |availableTimeZone| must be an 196 * available time zone name. If no canonical name was found, returns 197 * |nullptr|. 198 * 199 * This method only handles time zones which are canonicalized differently 200 * by ICU when compared to IANA. 201 */ 202 JSAtom* tryCanonicalizeTimeZoneConsistentWithIANA(JSAtom* availableTimeZone); 203 204 /** 205 * Returns the canonical time zone name. |availableTimeZone| must be an 206 * available time zone name. 207 */ 208 JSAtom* canonicalizeAvailableTimeZone(JSContext* cx, 209 JS::Handle<JSAtom*> availableTimeZone); 210 211 /** 212 * Validates and canonicalizes a time zone name. Returns the case-normalized 213 * identifier in |identifier| and its primary time zone in |primary|. If the 214 * input time zone isn't a valid IANA time zone name, |identifier| and 215 * |primary| both remain unchanged. 216 */ 217 bool validateAndCanonicalizeTimeZone( 218 JSContext* cx, const AvailableTimeZoneSet::Lookup& lookup, 219 JS::MutableHandle<JSAtom*> identifier, 220 JS::MutableHandle<JSAtom*> primary); 221 222 public: 223 /** 224 * Returns the canonical time zone name. |timeZone| must be a valid time zone 225 * name. 226 */ 227 JSLinearString* canonicalizeTimeZone(JSContext* cx, 228 JS::Handle<JSLinearString*> timeZone); 229 230 /** 231 * Validates and canonicalizes a time zone name. Returns the case-normalized 232 * identifier in |identifier| and its primary time zone in |primary|. If the 233 * input time zone isn't a valid IANA time zone name, |identifier| and 234 * |primary| both remain unchanged. 235 */ 236 bool validateAndCanonicalizeTimeZone(JSContext* cx, 237 JS::Handle<JSLinearString*> timeZone, 238 JS::MutableHandle<JSAtom*> identifier, 239 JS::MutableHandle<JSAtom*> primary); 240 241 /** 242 * Validates and canonicalizes a time zone name. Returns the case-normalized 243 * identifier in |identifier| and its primary time zone in |primary|. If the 244 * input time zone isn't a valid IANA time zone name, |identifier| and 245 * |primary| both remain unchanged. 246 */ 247 bool validateAndCanonicalizeTimeZone(JSContext* cx, 248 mozilla::Span<const char> timeZone, 249 JS::MutableHandle<JSAtom*> identifier, 250 JS::MutableHandle<JSAtom*> primary); 251 252 /** 253 * Returns an iterator over all available time zones supported by ICU. The 254 * returned time zone names aren't canonicalized. 255 */ 256 JS::Result<AvailableTimeZoneSet::Iterator> availableTimeZonesIteration( 257 JSContext* cx); 258 259 private: 260 using Locale = JSAtom*; 261 262 struct LocaleHasher { 263 struct Lookup : LinearStringLookup { 264 explicit Lookup(const JSLinearString* locale); 265 Lookup(const char* chars, size_t length); 266 }; 267 268 static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; } 269 static bool match(Locale key, const Lookup& lookup); 270 }; 271 272 using LocaleSet = GCHashSet<Locale, LocaleHasher, SystemAllocPolicy>; 273 274 // Set of available locales for all Intl service constructors except Collator, 275 // which uses its own set. 276 // 277 // UDateFormat: 278 // udat_[count,get]Available() return the same results as their 279 // uloc_[count,get]Available() counterparts. 280 // 281 // UNumberFormatter: 282 // unum_[count,get]Available() return the same results as their 283 // uloc_[count,get]Available() counterparts. 284 // 285 // UListFormatter, UPluralRules, and URelativeDateTimeFormatter: 286 // We're going to use ULocale availableLocales as per ICU recommendation: 287 // https://unicode-org.atlassian.net/browse/ICU-12756 288 LocaleSet availableLocales; 289 290 // ucol_[count,get]Available() return different results compared to 291 // uloc_[count,get]Available(), we can't use |availableLocales| here. 292 LocaleSet collatorAvailableLocales; 293 294 bool availableLocalesInitialized = false; 295 296 // CountAvailable and GetAvailable describe the signatures used for ICU API 297 // to determine available locales for various functionality. 298 using CountAvailable = int32_t (*)(); 299 using GetAvailable = const char* (*)(int32_t localeIndex); 300 301 template <class AvailableLocales> 302 static bool getAvailableLocales(JSContext* cx, LocaleSet& locales, 303 const AvailableLocales& availableLocales); 304 305 /** 306 * Precomputes the available locales sets. 307 */ 308 bool ensureAvailableLocales(JSContext* cx); 309 310 public: 311 /** 312 * Sets |available| to true if |locale| is supported by the requested Intl 313 * service constructor. Otherwise sets |available| to false. 314 */ 315 [[nodiscard]] bool isAvailableLocale(JSContext* cx, AvailableLocaleKind kind, 316 JS::Handle<JSLinearString*> locale, 317 bool* available); 318 319 /** 320 * Returns all available locales for |kind|. 321 */ 322 ArrayObject* availableLocalesOf(JSContext* cx, AvailableLocaleKind kind); 323 324 private: 325 /** 326 * The case first parameter (BCP47 key "kf") allows to switch the order of 327 * upper- and lower-case characters. ICU doesn't directly provide an API 328 * to query the default case first value of a given locale, but instead 329 * requires to instantiate a collator object and then query the case first 330 * attribute (UCOL_CASE_FIRST). 331 * To avoid instantiating an additional collator object whenever we need 332 * to retrieve the default case first value of a specific locale, we 333 * compute the default case first value for every supported locale only 334 * once and then keep a list of all locales which don't use the default 335 * case first setting. 336 * There is almost no difference between lower-case first and when case 337 * first is disabled (UCOL_LOWER_FIRST resp. UCOL_OFF), so we only need to 338 * track locales which use upper-case first as their default setting. 339 * 340 * Instantiating collator objects for each available locale is slow 341 * (bug 1527879), therefore we're hardcoding the two locales using upper-case 342 * first ("da" (Danish) and "mt" (Maltese)) and only assert in debug-mode 343 * these two locales match the upper-case first locales returned by ICU. A 344 * system-ICU may support a different set of locales, therefore we're always 345 * calling into ICU to find the upper-case first locales in that case. 346 */ 347 348 #if DEBUG || MOZ_SYSTEM_ICU 349 LocaleSet upperCaseFirstLocales; 350 351 bool upperCaseFirstInitialized = false; 352 353 /** 354 * Precomputes the available locales which use upper-case first sorting. 355 */ 356 bool ensureUpperCaseFirstLocales(JSContext* cx); 357 #endif 358 359 public: 360 /** 361 * Sets |isUpperFirst| to true if |locale| sorts upper-case characters 362 * before lower-case characters. 363 */ 364 bool isUpperCaseFirst(JSContext* cx, JS::Handle<JSLinearString*> locale, 365 bool* isUpperFirst); 366 367 private: 368 #if DEBUG || MOZ_SYSTEM_ICU 369 LocaleSet ignorePunctuationLocales; 370 371 bool ignorePunctuationInitialized = false; 372 373 /** 374 * Precomputes the available locales which ignore punctuation. 375 */ 376 bool ensureIgnorePunctuationLocales(JSContext* cx); 377 #endif 378 379 public: 380 /** 381 * Sets |ignorePunctuation| to true if |locale| ignores punctuation. 382 */ 383 bool isIgnorePunctuation(JSContext* cx, JS::Handle<JSLinearString*> locale, 384 bool* ignorePunctuation); 385 386 private: 387 using UniqueDateTimePatternGenerator = 388 mozilla::UniquePtr<mozilla::intl::DateTimePatternGenerator, 389 DateTimePatternGeneratorDeleter>; 390 391 UniqueDateTimePatternGenerator dateTimePatternGenerator; 392 JS::UniqueChars dateTimePatternGeneratorLocale; 393 394 public: 395 /** 396 * Get a non-owned cached instance of the DateTimePatternGenerator, which is 397 * expensive to instantiate. 398 * 399 * See: https://bugzilla.mozilla.org/show_bug.cgi?id=1549578 400 */ 401 mozilla::intl::DateTimePatternGenerator* getDateTimePatternGenerator( 402 JSContext* cx, const char* locale); 403 404 public: 405 void destroyInstance(); 406 407 void trace(JSTracer* trc); 408 409 size_t sizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf) const; 410 }; 411 412 } // namespace intl 413 414 } // namespace js 415 416 #endif /* builtin_intl_SharedIntlData_h */