tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

SharedIntlData.h (14805B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #ifndef builtin_intl_SharedIntlData_h
      8 #define builtin_intl_SharedIntlData_h
      9 
     10 #include "mozilla/MemoryReporting.h"
     11 #include "mozilla/Span.h"
     12 #include "mozilla/UniquePtr.h"
     13 
     14 #include <stddef.h>
     15 
     16 #include "js/AllocPolicy.h"
     17 #include "js/GCAPI.h"
     18 #include "js/GCHashTable.h"
     19 #include "js/Result.h"
     20 #include "js/RootingAPI.h"
     21 #include "js/Utility.h"
     22 #include "vm/StringType.h"
     23 
     24 namespace mozilla::intl {
     25 class DateTimePatternGenerator;
     26 }  // namespace mozilla::intl
     27 
     28 namespace js {
     29 
     30 class ArrayObject;
     31 
     32 namespace intl {
     33 
     /**
      * Selects which Intl service constructor an available-locales query is
      * about. Enumerator order is significant only insofar as it fixes the
      * implicit underlying values (Collator == 0, counting upwards).
      */
     enum class AvailableLocaleKind {
       Collator,            // Intl.Collator
       DateTimeFormat,      // Intl.DateTimeFormat
       DisplayNames,        // Intl.DisplayNames
       DurationFormat,      // Intl.DurationFormat
       ListFormat,          // Intl.ListFormat
       NumberFormat,        // Intl.NumberFormat
       PluralRules,         // Intl.PluralRules
       RelativeTimeFormat,  // Intl.RelativeTimeFormat
       Segmenter,           // Intl.Segmenter
     };
     45 
     46 /**
     47 * This deleter class exists so that mozilla::intl::DateTimePatternGenerator
     48 * can be a forward declaration, but still be used inside of a UniquePtr.
     49 */
     50 class DateTimePatternGeneratorDeleter {
     51 public:
     52  void operator()(mozilla::intl::DateTimePatternGenerator* ptr);
     53 };
     54 
     55 /**
     56 * Stores Intl data which can be shared across compartments (but not contexts).
     57 *
     58 * Used for data which is expensive when computed repeatedly or is not
     59 * available through ICU.
     60 */
     61 class SharedIntlData {
     62  struct LinearStringLookup {
     63    union {
     64      const JS::Latin1Char* latin1Chars;
     65      const char16_t* twoByteChars;
     66    };
     67    bool isLatin1;
     68    size_t length;
     69    JS::AutoCheckCannotGC nogc;
     70    HashNumber hash = 0;
     71 
     72    explicit LinearStringLookup(const JSLinearString* string)
     73        : isLatin1(string->hasLatin1Chars()), length(string->length()) {
     74      if (isLatin1) {
     75        latin1Chars = string->latin1Chars(nogc);
     76      } else {
     77        twoByteChars = string->twoByteChars(nogc);
     78      }
     79    }
     80 
     81    LinearStringLookup(const char* chars, size_t length)
     82        : isLatin1(true), length(length) {
     83      latin1Chars = reinterpret_cast<const JS::Latin1Char*>(chars);
     84    }
     85 
     86    LinearStringLookup(const char16_t* chars, size_t length)
     87        : isLatin1(false), length(length) {
     88      twoByteChars = chars;
     89    }
     90  };
     91 
     92 public:
     93  /**
     94   * Information tracking the set of the supported time zone names, derived
     95   * from the IANA time zone database <https://www.iana.org/time-zones>.
     96   *
     97   * There are two kinds of IANA time zone names: Zone and Link (denoted as
     98   * such in database source files). Zone names are the canonical, preferred
     99   * name for a time zone, e.g. Asia/Kolkata. Link names simply refer to
    100   * target Zone names for their meaning, e.g. Asia/Calcutta targets
    101   * Asia/Kolkata. That a name is a Link doesn't *necessarily* reflect a
    102   * sense of deprecation: some Link names also exist partly for convenience,
    103   * e.g. UTC and GMT as Link names targeting the Zone name Etc/UTC.
    104   *
    105   * Two data sources determine the time zone names we support: those ICU
    106   * supports and IANA's zone information.
    107   *
    108   * Unfortunately the names ICU and IANA support, and their Link
    109   * relationships from name to target, aren't identical, so we can't simply
    110   * implicitly trust ICU's name handling. We must perform various
    111   * preprocessing of user-provided zone names and post-processing of
    112   * ICU-provided zone names to implement ECMA-402's IANA-consistent behavior.
    113   *
    114   * Also see <https://ssl.icu-project.org/trac/ticket/12044> and
    115   * <http://unicode.org/cldr/trac/ticket/9892>.
    116   */
    117 
    118  using TimeZoneName = JSAtom*;
    119 
    120  struct AvailableTimeZoneHasher {
    121    struct Lookup : LinearStringLookup {
    122      explicit Lookup(const JSLinearString* timeZone);
    123      Lookup(const char* chars, size_t length);
    124      Lookup(const char16_t* chars, size_t length);
    125    };
    126 
    127    static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; }
    128    static bool match(TimeZoneName key, const Lookup& lookup);
    129  };
    130 
    131  struct TimeZoneHasher {
    132    using Lookup = TimeZoneName;
    133 
    134    static js::HashNumber hash(const Lookup& lookup) { return lookup->hash(); }
    135    static bool match(TimeZoneName key, const Lookup& lookup) {
    136      return key == lookup;
    137    }
    138  };
    139 
    140  using AvailableTimeZoneSet =
    141      GCHashSet<TimeZoneName, AvailableTimeZoneHasher, SystemAllocPolicy>;
    142  using TimeZoneSet =
    143      GCHashSet<TimeZoneName, TimeZoneHasher, SystemAllocPolicy>;
    144  using TimeZoneMap =
    145      GCHashMap<TimeZoneName, TimeZoneName, TimeZoneHasher, SystemAllocPolicy>;
    146 
    147 private:
    148  /**
    149   * As a threshold matter, available time zones are those time zones ICU
    150   * supports, via ucal_openTimeZones. But ICU supports additional non-IANA
    151   * time zones described in intl/icu/source/tools/tzcode/icuzones (listed in
    152   * TimeZoneDataGenerated.h's |legacyICUTimeZones|) for its own backwards
    153   * compatibility purposes. This set consists of ICU's supported time zones,
    154   * minus all backwards-compatibility time zones.
    155   */
    156  AvailableTimeZoneSet availableTimeZones;
    157 
    158  /**
    159   * IANA treats some time zone names as Zones, that ICU instead treats as
    160   * Links. For example, IANA considers "America/Indiana/Indianapolis" to be
    161   * a Zone and "America/Fort_Wayne" a Link that targets it, but ICU
    162   * considers the former a Link that targets "America/Indianapolis" (which
    163   * IANA treats as a Link).
    164   *
    165   * ECMA-402 requires that we respect IANA data, so if we're asked to
    166   * canonicalize a time zone name in this set, we must *not* return ICU's
    167   * canonicalization.
    168   */
    169  TimeZoneSet ianaZonesTreatedAsLinksByICU;
    170 
    171  /**
    172   * IANA treats some time zone names as Links to one target, that ICU
    173   * instead treats as either Zones, or Links to different targets. An
    174   * example of the former is "Asia/Calcutta, which IANA assigns the target
    175   * "Asia/Kolkata" but ICU considers its own Zone. An example of the latter
    176   * is "US/East-Indiana", which IANA assigns the target
    177   * "America/Indiana/Indianapolis" but ICU assigns the target
    178   * "America/Indianapolis".
    179   *
    180   * ECMA-402 requires that we respect IANA data, so if we're asked to
    181   * canonicalize a time zone name that's a key in this map, we *must* return
    182   * the corresponding value and *must not* return ICU's canonicalization.
    183   */
    184  TimeZoneMap ianaLinksCanonicalizedDifferentlyByICU;
    185 
    186  bool timeZoneDataInitialized = false;
    187 
    188  /**
    189   * Precomputes the available time zone names, because it's too expensive to
    190   * call ucal_openTimeZones() repeatedly.
    191   */
    192  bool ensureTimeZones(JSContext* cx);
    193 
    194  /**
    195   * Returns the canonical time zone name. |availableTimeZone| must be an
    196   * available time zone name. If no canonical name was found, returns
    197   * |nullptr|.
    198   *
    199   * This method only handles time zones which are canonicalized differently
    200   * by ICU when compared to IANA.
    201   */
    202  JSAtom* tryCanonicalizeTimeZoneConsistentWithIANA(JSAtom* availableTimeZone);
    203 
    204  /**
    205   * Returns the canonical time zone name. |availableTimeZone| must be an
    206   * available time zone name.
    207   */
    208  JSAtom* canonicalizeAvailableTimeZone(JSContext* cx,
    209                                        JS::Handle<JSAtom*> availableTimeZone);
    210 
    211  /**
    212   * Validates and canonicalizes a time zone name. Returns the case-normalized
    213   * identifier in |identifier| and its primary time zone in |primary|. If the
    214   * input time zone isn't a valid IANA time zone name, |identifier| and
    215   * |primary| both remain unchanged.
    216   */
    217  bool validateAndCanonicalizeTimeZone(
    218      JSContext* cx, const AvailableTimeZoneSet::Lookup& lookup,
    219      JS::MutableHandle<JSAtom*> identifier,
    220      JS::MutableHandle<JSAtom*> primary);
    221 
    222 public:
    223  /**
    224   * Returns the canonical time zone name. |timeZone| must be a valid time zone
    225   * name.
    226   */
    227  JSLinearString* canonicalizeTimeZone(JSContext* cx,
    228                                       JS::Handle<JSLinearString*> timeZone);
    229 
    230  /**
    231   * Validates and canonicalizes a time zone name. Returns the case-normalized
    232   * identifier in |identifier| and its primary time zone in |primary|. If the
    233   * input time zone isn't a valid IANA time zone name, |identifier| and
    234   * |primary| both remain unchanged.
    235   */
    236  bool validateAndCanonicalizeTimeZone(JSContext* cx,
    237                                       JS::Handle<JSLinearString*> timeZone,
    238                                       JS::MutableHandle<JSAtom*> identifier,
    239                                       JS::MutableHandle<JSAtom*> primary);
    240 
    241  /**
    242   * Validates and canonicalizes a time zone name. Returns the case-normalized
    243   * identifier in |identifier| and its primary time zone in |primary|. If the
    244   * input time zone isn't a valid IANA time zone name, |identifier| and
    245   * |primary| both remain unchanged.
    246   */
    247  bool validateAndCanonicalizeTimeZone(JSContext* cx,
    248                                       mozilla::Span<const char> timeZone,
    249                                       JS::MutableHandle<JSAtom*> identifier,
    250                                       JS::MutableHandle<JSAtom*> primary);
    251 
    252  /**
    253   * Returns an iterator over all available time zones supported by ICU. The
    254   * returned time zone names aren't canonicalized.
    255   */
    256  JS::Result<AvailableTimeZoneSet::Iterator> availableTimeZonesIteration(
    257      JSContext* cx);
    258 
    259 private:
    260  using Locale = JSAtom*;
    261 
    262  struct LocaleHasher {
    263    struct Lookup : LinearStringLookup {
    264      explicit Lookup(const JSLinearString* locale);
    265      Lookup(const char* chars, size_t length);
    266    };
    267 
    268    static js::HashNumber hash(const Lookup& lookup) { return lookup.hash; }
    269    static bool match(Locale key, const Lookup& lookup);
    270  };
    271 
    272  using LocaleSet = GCHashSet<Locale, LocaleHasher, SystemAllocPolicy>;
    273 
    274  // Set of available locales for all Intl service constructors except Collator,
    275  // which uses its own set.
    276  //
    277  // UDateFormat:
    278  // udat_[count,get]Available() return the same results as their
    279  // uloc_[count,get]Available() counterparts.
    280  //
    281  // UNumberFormatter:
    282  // unum_[count,get]Available() return the same results as their
    283  // uloc_[count,get]Available() counterparts.
    284  //
    285  // UListFormatter, UPluralRules, and URelativeDateTimeFormatter:
    286  // We're going to use ULocale availableLocales as per ICU recommendation:
    287  // https://unicode-org.atlassian.net/browse/ICU-12756
    288  LocaleSet availableLocales;
    289 
    290  // ucol_[count,get]Available() return different results compared to
    291  // uloc_[count,get]Available(), we can't use |availableLocales| here.
    292  LocaleSet collatorAvailableLocales;
    293 
    294  bool availableLocalesInitialized = false;
    295 
    296  // CountAvailable and GetAvailable describe the signatures used for ICU API
    297  // to determine available locales for various functionality.
    298  using CountAvailable = int32_t (*)();
    299  using GetAvailable = const char* (*)(int32_t localeIndex);
    300 
    301  template <class AvailableLocales>
    302  static bool getAvailableLocales(JSContext* cx, LocaleSet& locales,
    303                                  const AvailableLocales& availableLocales);
    304 
    305  /**
    306   * Precomputes the available locales sets.
    307   */
    308  bool ensureAvailableLocales(JSContext* cx);
    309 
    310 public:
    311  /**
    312   * Sets |available| to true if |locale| is supported by the requested Intl
    313   * service constructor. Otherwise sets |available| to false.
    314   */
    315  [[nodiscard]] bool isAvailableLocale(JSContext* cx, AvailableLocaleKind kind,
    316                                       JS::Handle<JSLinearString*> locale,
    317                                       bool* available);
    318 
    319  /**
    320   * Returns all available locales for |kind|.
    321   */
    322  ArrayObject* availableLocalesOf(JSContext* cx, AvailableLocaleKind kind);
    323 
    324 private:
    325  /**
    326   * The case first parameter (BCP47 key "kf") allows to switch the order of
    327   * upper- and lower-case characters. ICU doesn't directly provide an API
    328   * to query the default case first value of a given locale, but instead
    329   * requires to instantiate a collator object and then query the case first
    330   * attribute (UCOL_CASE_FIRST).
    331   * To avoid instantiating an additional collator object whenever we need
    332   * to retrieve the default case first value of a specific locale, we
    333   * compute the default case first value for every supported locale only
    334   * once and then keep a list of all locales which don't use the default
    335   * case first setting.
    336   * There is almost no difference between lower-case first and when case
    337   * first is disabled (UCOL_LOWER_FIRST resp. UCOL_OFF), so we only need to
    338   * track locales which use upper-case first as their default setting.
    339   *
    340   * Instantiating collator objects for each available locale is slow
    341   * (bug 1527879), therefore we're hardcoding the two locales using upper-case
    342   * first ("da" (Danish) and "mt" (Maltese)) and only assert in debug-mode
    343   * these two locales match the upper-case first locales returned by ICU. A
    344   * system-ICU may support a different set of locales, therefore we're always
    345   * calling into ICU to find the upper-case first locales in that case.
    346   */
    347 
    348 #if DEBUG || MOZ_SYSTEM_ICU
    349  LocaleSet upperCaseFirstLocales;
    350 
    351  bool upperCaseFirstInitialized = false;
    352 
    353  /**
    354   * Precomputes the available locales which use upper-case first sorting.
    355   */
    356  bool ensureUpperCaseFirstLocales(JSContext* cx);
    357 #endif
    358 
    359 public:
    360  /**
    361   * Sets |isUpperFirst| to true if |locale| sorts upper-case characters
    362   * before lower-case characters.
    363   */
    364  bool isUpperCaseFirst(JSContext* cx, JS::Handle<JSLinearString*> locale,
    365                        bool* isUpperFirst);
    366 
    367 private:
    368 #if DEBUG || MOZ_SYSTEM_ICU
    369  LocaleSet ignorePunctuationLocales;
    370 
    371  bool ignorePunctuationInitialized = false;
    372 
    373  /**
    374   * Precomputes the available locales which ignore punctuation.
    375   */
    376  bool ensureIgnorePunctuationLocales(JSContext* cx);
    377 #endif
    378 
    379 public:
    380  /**
    381   * Sets |ignorePunctuation| to true if |locale| ignores punctuation.
    382   */
    383  bool isIgnorePunctuation(JSContext* cx, JS::Handle<JSLinearString*> locale,
    384                           bool* ignorePunctuation);
    385 
    386 private:
    387  using UniqueDateTimePatternGenerator =
    388      mozilla::UniquePtr<mozilla::intl::DateTimePatternGenerator,
    389                         DateTimePatternGeneratorDeleter>;
    390 
    391  UniqueDateTimePatternGenerator dateTimePatternGenerator;
    392  JS::UniqueChars dateTimePatternGeneratorLocale;
    393 
    394 public:
    395  /**
    396   * Get a non-owned cached instance of the DateTimePatternGenerator, which is
    397   * expensive to instantiate.
    398   *
    399   * See: https://bugzilla.mozilla.org/show_bug.cgi?id=1549578
    400   */
    401  mozilla::intl::DateTimePatternGenerator* getDateTimePatternGenerator(
    402      JSContext* cx, const char* locale);
    403 
    404 public:
    405  void destroyInstance();
    406 
    407  void trace(JSTracer* trc);
    408 
    409  size_t sizeOfExcludingThis(mozilla::MallocSizeOf mallocSizeOf) const;
    410 };
    411 
    412 }  // namespace intl
    413 
    414 }  // namespace js
    415 
    416 #endif /* builtin_intl_SharedIntlData_h */