tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

nsLanguageAtomService.cpp (8750B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this
      4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      5 
      6 #include "nsLanguageAtomService.h"
      7 
      8 #include "mozilla/Encoding.h"
      9 #include "mozilla/intl/Locale.h"
     10 #include "mozilla/intl/OSPreferences.h"
     11 #include "MainThreadUtils.h"
     12 #include "nsGkAtoms.h"
     13 #include "nsUConvPropertySearch.h"
     14 #include "nsUnicharUtils.h"
     15 #include "MainThreadUtils.h"
     16 
     17 #include <mutex>  // for call_once
     18 
     19 using namespace mozilla;
     20 using mozilla::intl::OSPreferences;
     21 
     22 // List of mozilla internal x-* tags that map to themselves (see bug 256257)
     23 static constexpr nsStaticAtom* kLangGroups[] = {
     24    // This list must be sorted!
     25    nsGkAtoms::x_armn,  nsGkAtoms::x_cyrillic, nsGkAtoms::x_devanagari,
     26    nsGkAtoms::x_geor,  nsGkAtoms::x_math,     nsGkAtoms::x_tamil,
     27    nsGkAtoms::Unicode, nsGkAtoms::x_western
     28    // These self-mappings are not necessary unless somebody use them to specify
     29    // lang in (X)HTML/XML documents, which they shouldn't. (see bug 256257)
     30    // x-beng=x-beng
     31    // x-cans=x-cans
     32    // x-ethi=x-ethi
     33    // x-guru=x-guru
     34    // x-gujr=x-gujr
     35    // x-khmr=x-khmr
     36    // x-mlym=x-mlym
     37 };
     38 
     39 // Map ISO 15924 script codes from BCP47 lang tag to mozilla's langGroups.
     40 static constexpr struct {
     41  const char* mTag;
     42  nsStaticAtom* mAtom;
     43 } kScriptLangGroup[] = {
     44    // This list must be sorted by script code!
     45    {"Arab", nsGkAtoms::ar},
     46    {"Armn", nsGkAtoms::x_armn},
     47    {"Beng", nsGkAtoms::x_beng},
     48    {"Cans", nsGkAtoms::x_cans},
     49    {"Cyrl", nsGkAtoms::x_cyrillic},
     50    {"Deva", nsGkAtoms::x_devanagari},
     51    {"Ethi", nsGkAtoms::x_ethi},
     52    {"Geok", nsGkAtoms::x_geor},
     53    {"Geor", nsGkAtoms::x_geor},
     54    {"Grek", nsGkAtoms::el},
     55    {"Gujr", nsGkAtoms::x_gujr},
     56    {"Guru", nsGkAtoms::x_guru},
     57    {"Hang", nsGkAtoms::ko},
     58    // Hani is not mapped to a specific langGroup, we prefer to look at the
     59    // primary language subtag in this case
     60    {"Hans", nsGkAtoms::Chinese},
     61    // Hant is special-cased in code
     62    // Hant=zh-HK
     63    // Hant=zh-TW
     64    {"Hebr", nsGkAtoms::he},
     65    {"Hira", nsGkAtoms::Japanese},
     66    {"Jpan", nsGkAtoms::Japanese},
     67    {"Kana", nsGkAtoms::Japanese},
     68    {"Khmr", nsGkAtoms::x_khmr},
     69    {"Knda", nsGkAtoms::x_knda},
     70    {"Kore", nsGkAtoms::ko},
     71    {"Latn", nsGkAtoms::x_western},
     72    {"Mlym", nsGkAtoms::x_mlym},
     73    {"Orya", nsGkAtoms::x_orya},
     74    {"Sinh", nsGkAtoms::x_sinh},
     75    {"Taml", nsGkAtoms::x_tamil},
     76    {"Telu", nsGkAtoms::x_telu},
     77    {"Thai", nsGkAtoms::th},
     78    {"Tibt", nsGkAtoms::x_tibt}};
     79 
     80 StaticAutoPtr<nsLanguageAtomService> nsLanguageAtomService::sLangAtomService;
     81 
     82 // static
     83 nsLanguageAtomService* nsLanguageAtomService::GetService() {
     84  static std::once_flag sOnce;
     85 
     86  std::call_once(sOnce,
     87                 []() { sLangAtomService = new nsLanguageAtomService(); });
     88 
     89  return sLangAtomService.get();
     90 }
     91 
     92 // static
     93 void nsLanguageAtomService::Shutdown() {
     94  // We only expect to be shut down by the main thread.
     95  MOZ_ASSERT(NS_IsMainThread());
     96  sLangAtomService = nullptr;
     97 }
     98 
     99 nsStaticAtom* nsLanguageAtomService::LookupLanguage(
    100    const nsACString& aLanguage) {
    101  nsAutoCString lowered(aLanguage);
    102  ToLowerCase(lowered);
    103 
    104  RefPtr<nsAtom> lang = NS_Atomize(lowered);
    105  return GetLanguageGroup(lang);
    106 }
    107 
    108 nsAtom* nsLanguageAtomService::GetLocaleLanguage() {
    109  {
    110    AutoReadLock lock(mLock);
    111    if (mLocaleLanguage) {
    112      return mLocaleLanguage;
    113    }
    114  }
    115 
    116  AutoWriteLock lock(mLock);
    117  if (!mLocaleLanguage) {
    118    AutoTArray<nsCString, 10> regionalPrefsLocales;
    119    // XXX Are the OSPreferences calls here safe to call from any thread?
    120    // In practice GetLocaleLanguage will be called early on the main thread
    121    // (e.g. by nsFontCache), so mLocaleLanguage should be safely initialized
    122    // before we try to use it from worker threads, but that may not be fully
    123    // guaranteed.
    124    if (NS_SUCCEEDED(OSPreferences::GetInstance()->GetRegionalPrefsLocales(
    125            regionalPrefsLocales))) {
    126      // use lowercase for all language atoms
    127      ToLowerCase(regionalPrefsLocales[0]);
    128      mLocaleLanguage = NS_Atomize(regionalPrefsLocales[0]);
    129    } else {
    130      nsAutoCString locale;
    131      OSPreferences::GetInstance()->GetSystemLocale(locale);
    132 
    133      ToLowerCase(locale);  // use lowercase for all language atoms
    134      mLocaleLanguage = NS_Atomize(locale);
    135    }
    136  }
    137 
    138  return mLocaleLanguage;
    139 }
    140 
    141 nsStaticAtom* nsLanguageAtomService::GetLanguageGroup(nsAtom* aLanguage) {
    142  {
    143    AutoReadLock lock(mLock);
    144    if (nsStaticAtom* atom = mLangToGroup.Get(aLanguage)) {
    145      return atom;
    146    }
    147  }
    148 
    149  AutoWriteLock lock(mLock);
    150  return mLangToGroup.LookupOrInsertWith(
    151      aLanguage, [&] { return GetUncachedLanguageGroup(aLanguage); });
    152 }
    153 
    154 nsStaticAtom* nsLanguageAtomService::GetUncachedLanguageGroup(
    155    nsAtom* aLanguage) const {
    156  nsAutoCString langStr;
    157  aLanguage->ToUTF8String(langStr);
    158  ToLowerCase(langStr);
    159 
    160  if (langStr[0] == 'x' && langStr[1] == '-') {
    161    // Internal x-* langGroup codes map to themselves (see bug 256257)
    162    for (nsStaticAtom* langGroup : kLangGroups) {
    163      if (langGroup == aLanguage) {
    164        return langGroup;
    165      }
    166      if (aLanguage->IsAsciiLowercase()) {
    167        continue;
    168      }
    169      // Do the slow ascii-case-insensitive comparison just if needed.
    170      nsDependentAtomString string(langGroup);
    171      if (string.EqualsASCII(langStr.get(), langStr.Length())) {
    172        return langGroup;
    173      }
    174    }
    175  } else {
    176    // If the lang code can be parsed as BCP47, look up its (likely) script.
    177 
    178    // https://bugzilla.mozilla.org/show_bug.cgi?id=1618034:
    179    // First strip any private subtags that would cause Locale to reject the
    180    // tag as non-wellformed.
    181    nsACString::const_iterator start, end;
    182    langStr.BeginReading(start);
    183    langStr.EndReading(end);
    184    if (FindInReadable("-x-"_ns, start, end)) {
    185      // The substring we want ends at the beginning of the "-x-" subtag.
    186      langStr.Truncate(start.get() - langStr.BeginReading());
    187    }
    188 
    189    intl::Locale loc;
    190    auto result = intl::LocaleParser::TryParse(langStr, loc);
    191    if (!result.isOk()) {
    192      // Did the author (wrongly) use '_' instead of '-' to separate subtags?
    193      // If so, fix it up and re-try parsing.
    194      if (langStr.Contains('_')) {
    195        langStr.ReplaceChar('_', '-');
    196 
    197        // Throw away the partially parsed locale and re-start parsing.
    198        loc = {};
    199        result = intl::LocaleParser::TryParse(langStr, loc);
    200      }
    201    }
    202    if (result.isOk() && loc.Canonicalize().isOk()) {
    203      // Fill in script subtag if not present.
    204      if (loc.Script().Missing()) {
    205        // No script. At this point it's fair to assume that en-* maps to
    206        // x-western. This fast path avoids the slow call to AddLikelySubtags.
    207        if (loc.Language().EqualTo("en")) {
    208          return nsGkAtoms::x_western;
    209        }
    210 
    211        if (loc.AddLikelySubtags().isErr()) {
    212          // Fall back to x-unicode if no match was found
    213          return nsGkAtoms::Unicode;
    214        }
    215      }
    216      // Traditional Chinese has separate prefs for Hong Kong / Taiwan;
    217      // check the region subtag.
    218      if (loc.Script().EqualTo("Hant")) {
    219        if (loc.Region().EqualTo("HK")) {
    220          return nsGkAtoms::HongKongChinese;
    221        }
    222        return nsGkAtoms::Taiwanese;
    223      }
    224      // Search list of known script subtags that map to langGroup codes.
    225      size_t foundIndex;
    226      Span<const char> scriptAsSpan = loc.Script().Span();
    227      nsDependentCSubstring script(scriptAsSpan.data(), scriptAsSpan.size());
    228      if (BinarySearchIf(
    229              kScriptLangGroup, 0, std::size(kScriptLangGroup),
    230              [script](const auto& entry) -> int {
    231                return Compare(script, nsDependentCString(entry.mTag));
    232              },
    233              &foundIndex)) {
    234        return kScriptLangGroup[foundIndex].mAtom;
    235      }
    236      // Script subtag was not recognized (includes "Hani"); check the language
    237      // subtag for CJK possibilities so that we'll prefer the appropriate font
    238      // rather than falling back to the browser's hardcoded preference.
    239      if (loc.Language().EqualTo("zh")) {
    240        if (loc.Region().EqualTo("HK")) {
    241          return nsGkAtoms::HongKongChinese;
    242        }
    243        if (loc.Region().EqualTo("TW")) {
    244          return nsGkAtoms::Taiwanese;
    245        }
    246        return nsGkAtoms::Chinese;
    247      }
    248      if (loc.Language().EqualTo("ja")) {
    249        return nsGkAtoms::Japanese;
    250      }
    251      if (loc.Language().EqualTo("ko")) {
    252        return nsGkAtoms::ko;
    253      }
    254    }
    255  }
    256 
    257  // Fall back to x-unicode if no match was found
    258  return nsGkAtoms::Unicode;
    259 }