tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

nsHyphenator.cpp (16742B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this
      4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      5 
      6 #include "nsHyphenator.h"
      7 
      8 #include "mozilla/dom/ContentChild.h"
      9 #include "mozilla/Omnijar.h"
     10 #include "mozilla/ipc/SharedMemoryHandle.h"
     11 #include "mozilla/ipc/SharedMemoryMapping.h"
     12 #include "nsContentUtils.h"
     13 #include "nsEscape.h"
     14 #include "nsIChannel.h"
     15 #include "nsIFile.h"
     16 #include "nsIFileURL.h"
     17 #include "nsIInputStream.h"
     18 #include "nsIJARURI.h"
     19 #include "nsIURI.h"
     20 #include "nsNetUtil.h"
     21 #include "nsUnicodeProperties.h"
     22 #include "nsUTF8Utils.h"
     23 #include "nsZipArchive.h"
     24 
     25 #include "mapped_hyph.h"
     26 
     27 using namespace mozilla;
     28 
     29 namespace std {
     30 void default_delete<const HyphDic>::operator()(const HyphDic* aHyph) const {
     31  mapped_hyph_free_dictionary(const_cast<HyphDic*>(aHyph));
     32 }
     33 
     34 void default_delete<const CompiledData>::operator()(
     35    const CompiledData* aData) const {
     36  mapped_hyph_free_compiled_data(const_cast<CompiledData*>(aData));
     37 }
     38 }  // namespace std
     39 
     40 static const uint8_t* GetItemPtrFromJarURI(nsIJARURI* aJAR, uint32_t* aLength) {
     41  // Try to get the jarfile's nsZipArchive, find the relevant item, and return
     42  // a pointer to its data provided it is stored uncompressed.
     43  nsCOMPtr<nsIURI> jarFile;
     44  if (NS_FAILED(aJAR->GetJARFile(getter_AddRefs(jarFile)))) {
     45    return nullptr;
     46  }
     47  nsCOMPtr<nsIFileURL> fileUrl = do_QueryInterface(jarFile);
     48  if (!fileUrl) {
     49    return nullptr;
     50  }
     51  nsCOMPtr<nsIFile> file;
     52  fileUrl->GetFile(getter_AddRefs(file));
     53  if (!file) {
     54    return nullptr;
     55  }
     56  RefPtr<nsZipArchive> archive = Omnijar::GetReader(file);
     57  if (archive) {
     58    nsCString path;
     59    aJAR->GetJAREntry(path);
     60    nsZipItem* item = archive->GetItem(path);
     61    if (item && item->Compression() == 0 && item->Size() > 0) {
     62      // We do NOT own this data, but it won't go away until the omnijar
     63      // file is closed during shutdown.
     64      const uint8_t* data = archive->GetData(item);
     65      if (data) {
     66        *aLength = item->Size();
     67        return data;
     68      }
     69    }
     70  }
     71  return nullptr;
     72 }
     73 
     74 static ipc::ReadOnlySharedMemoryMapping GetHyphDictFromParent(nsIURI* aURI) {
     75  MOZ_ASSERT(!XRE_IsParentProcess());
     76  ipc::ReadOnlySharedMemoryHandle handle;
     77  MOZ_ASSERT(aURI);
     78  if (!dom::ContentChild::GetSingleton()->SendGetHyphDict(aURI, &handle)) {
     79    return nullptr;
     80  }
     81  if (!handle.IsValid()) {
     82    return nullptr;
     83  }
     84  auto map = handle.Map();
     85  if (!map) {
     86    return nullptr;
     87  }
     88  if (!map.Address()) {
     89    return nullptr;
     90  }
     91  return map;
     92 }
     93 
     94 static ipc::ReadOnlySharedMemoryHandle CopyToShmem(const CompiledData* aData) {
     95  MOZ_ASSERT(XRE_IsParentProcess());
     96 
     97  // The shm-related calls here are not expected to fail, but if they do,
     98  // we'll just return null (as if the resource was unavailable) and proceed
     99  // without hyphenation.
    100  uint32_t size = mapped_hyph_compiled_data_size(aData);
    101  auto handle = ipc::shared_memory::CreateFreezable(size);
    102  if (!handle) {
    103    return nullptr;
    104  }
    105  auto map = std::move(handle).Map();
    106  if (!map) {
    107    return nullptr;
    108  }
    109  char* buffer = map.DataAs<char>();
    110  if (!buffer) {
    111    return nullptr;
    112  }
    113 
    114  memcpy(buffer, mapped_hyph_compiled_data_ptr(aData), size);
    115  return std::move(map).Freeze();
    116 }
    117 
    118 static ipc::ReadOnlySharedMemoryHandle LoadFromURI(nsIURI* aURI,
    119                                                   bool aPrecompiled) {
    120  MOZ_ASSERT(XRE_IsParentProcess());
    121  nsCOMPtr<nsIChannel> channel;
    122  if (NS_FAILED(NS_NewChannel(
    123          getter_AddRefs(channel), aURI, nsContentUtils::GetSystemPrincipal(),
    124          nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_SEC_CONTEXT_IS_NULL,
    125          nsIContentPolicy::TYPE_OTHER))) {
    126    return nullptr;
    127  }
    128  nsCOMPtr<nsIInputStream> instream;
    129  if (NS_FAILED(channel->Open(getter_AddRefs(instream)))) {
    130    return nullptr;
    131  }
    132  // Check size, bail out if it is excessively large (the largest of the
    133  // hyphenation files currently shipped with Firefox is around 1MB
    134  // uncompressed).
    135  uint64_t available;
    136  if (NS_FAILED(instream->Available(&available)) || !available ||
    137      available > 16 * 1024 * 1024) {
    138    return nullptr;
    139  }
    140 
    141  if (aPrecompiled) {
    142    auto handle = ipc::shared_memory::CreateFreezable(available);
    143    if (!handle) {
    144      return nullptr;
    145    }
    146    auto map = std::move(handle).Map();
    147    if (!map) {
    148      return nullptr;
    149    }
    150    char* buffer = map.DataAs<char>();
    151    if (!buffer) {
    152      return nullptr;
    153    }
    154 
    155    uint32_t bytesRead = 0;
    156    if (NS_FAILED(instream->Read(buffer, available, &bytesRead)) ||
    157        bytesRead != available) {
    158      return nullptr;
    159    }
    160 
    161    if (!mapped_hyph_is_valid_hyphenator(
    162            reinterpret_cast<const uint8_t*>(buffer), bytesRead)) {
    163      return nullptr;
    164    }
    165 
    166    return std::move(map).Freeze();
    167  }
    168 
    169  // Read from the URI into a temporary buffer, compile it, then copy the
    170  // compiled resource to a shared memory region.
    171  auto buffer = MakeUnique<char[]>(available);
    172  uint32_t bytesRead = 0;
    173  if (NS_FAILED(instream->Read(buffer.get(), available, &bytesRead)) ||
    174      bytesRead != available) {
    175    return nullptr;
    176  }
    177 
    178  UniquePtr<const CompiledData> data(mapped_hyph_compile_buffer(
    179      reinterpret_cast<const uint8_t*>(buffer.get()), bytesRead, false));
    180  if (data) {
    181    return CopyToShmem(data.get());
    182  }
    183 
    184  return nullptr;
    185 }
    186 
    187 nsHyphenator::nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized)
    188    : mDict(Span<const uint8_t>()),
    189      mHyphenateCapitalized(aHyphenateCapitalized) {
    190  // Files with extension ".hyf" are expected to be precompiled mapped_hyph
    191  // tables; we also support uncompiled ".dic" files, but they are more
    192  // expensive to process on first load.
    193  nsAutoCString path;
    194  aURI->GetFilePath(path);
    195  bool precompiled = StringEndsWith(path, ".hyf"_ns);
    196 
    197  // Content processes don't do compilation; they depend on the parent giving
    198  // them a compiled version of the resource, so that we only pay the cost of
    199  // compilation once per language per session.
    200  if (!precompiled && !XRE_IsParentProcess()) {
    201    auto shm = GetHyphDictFromParent(aURI);
    202    if (shm) {
    203      // We don't need to validate mDict because the parent process
    204      // will have done so.
    205      mDict.emplace<ipc::ReadOnlySharedMemoryMapping>(std::move(shm));
    206    }
    207    return;
    208  }
    209 
    210  nsCOMPtr<nsIJARURI> jar = do_QueryInterface(aURI);
    211  if (jar) {
    212    // This gives us a raw pointer into the omnijar's data (if uncompressed);
    213    // we do not own it and must not attempt to free it!
    214    uint32_t length;
    215    const uint8_t* ptr = GetItemPtrFromJarURI(jar, &length);
    216    if (ptr) {
    217      if (precompiled) {
    218        // The data should be directly usable by mapped_hyph; validate that it
    219        // looks correct, and save the pointer.
    220        if (mapped_hyph_is_valid_hyphenator(ptr, length)) {
    221          mDict.emplace<Span<const uint8_t>>(ptr, length);
    222          return;
    223        }
    224      } else {
    225        // The data is an uncompiled pattern file, so we need to compile it.
    226        // We then move it to shared memory so we can expose it to content
    227        // processes.
    228        MOZ_ASSERT(XRE_IsParentProcess());
    229        UniquePtr<const CompiledData> data(
    230            mapped_hyph_compile_buffer(ptr, length, false));
    231        if (data) {
    232          auto shm = CopyToShmem(data.get());
    233          if (shm) {
    234            mDict.emplace<ipc::ReadOnlySharedMemoryHandle>(std::move(shm));
    235            return;
    236          }
    237        }
    238      }
    239    } else {
    240      // Omnijar must be compressed (currently this is the case on Android).
    241      // If we're the parent process, decompress the resource into a shmem
    242      // buffer; if we're a child, send a request to the parent for the
    243      // shared-memory copy (which it will load if not already available).
    244      if (XRE_IsParentProcess()) {
    245        auto shm = LoadFromURI(aURI, precompiled);
    246        if (shm) {
    247          mDict.emplace<ipc::ReadOnlySharedMemoryHandle>(std::move(shm));
    248          return;
    249        }
    250      } else {
    251        auto shm = GetHyphDictFromParent(aURI);
    252        if (shm) {
    253          // We don't need to validate mDict because the parent process
    254          // will have done so.
    255          mDict.emplace<ipc::ReadOnlySharedMemoryMapping>(std::move(shm));
    256          return;
    257        }
    258      }
    259    }
    260  }
    261 
    262  // We get file:// URIs when running an unpackaged build; they could also
    263  // occur if we support adding hyphenation dictionaries by putting files in
    264  // a directory of the profile, for example.
    265  if (aURI->SchemeIs("file")) {
    266    // Ask the Rust lib to mmap the file. In this case our mDictSize field
    267    // remains zero; mDict is not a pointer to the raw data but an opaque
    268    // reference to a Rust object, and can only be freed by passing it to
    269    // mapped_hyph_free_dictionary().
    270    // (This case occurs in unpackaged developer builds.)
    271 #if XP_WIN
    272    // GetFilePath returns the path with an unexpected leading slash (like
    273    // "/c:/path/to/firefox/...") that may prevent it being found if it's an
    274    // absolute Windows path starting with a drive letter.
    275    // So check for this case and strip the slash.
    276    if (path.Length() > 2 && path[0] == '/' && path[2] == ':') {
    277      path.Cut(0, 1);
    278    }
    279 #endif
    280    // In case of %-escaped spaces or other "special" chars in the path,
    281    // we need the unescaped version to pass to mapped_hyph_load_dictionary.
    282    NS_UnescapeURL(path);
    283    if (precompiled) {
    284      // If the file is compiled, we can just map it directly.
    285      UniquePtr<const HyphDic> dic(mapped_hyph_load_dictionary(path.get()));
    286      if (dic) {
    287        mDict = AsVariant(std::move(dic));
    288        return;
    289      }
    290    } else {
    291      // For an uncompiled .dic file, the parent process is responsible for
    292      // compiling it and storing the result in a shmem block that can be
    293      // shared to content processes.
    294      MOZ_ASSERT(XRE_IsParentProcess());
    295      MOZ_ASSERT(StringEndsWith(path, ".dic"_ns));
    296      UniquePtr<const CompiledData> data(
    297          mapped_hyph_compile_file(path.get(), false));
    298      if (data) {
    299        auto shm = CopyToShmem(data.get());
    300        if (shm) {
    301          mDict.emplace<ipc::ReadOnlySharedMemoryHandle>(std::move(shm));
    302          return;
    303        }
    304      }
    305    }
    306  }
    307 
    308  // Each loading branch above will return if successful. So if we get here,
    309  // whichever load type we attempted must have failed because something about
    310  // the resource is broken.
    311  nsAutoCString msg;
    312  aURI->GetSpec(msg);
    313  msg.Insert("Invalid hyphenation resource: ", 0);
    314  NS_ASSERTION(false, msg.get());
    315 }
    316 
    317 bool nsHyphenator::IsValid() {
    318  return mDict.match(
    319      [](Span<const uint8_t>& span) { return !span.IsEmpty(); },
    320      [](ipc::ReadOnlySharedMemoryHandle& shm) { return shm.IsValid(); },
    321      [](ipc::ReadOnlySharedMemoryMapping& shm) { return shm.IsValid(); },
    322      [](UniquePtr<const HyphDic>& hyph) { return hyph != nullptr; });
    323 }
    324 
    325 nsresult nsHyphenator::Hyphenate(const nsAString& aString,
    326                                 nsTArray<bool>& aHyphens) {
    327  if (!aHyphens.SetLength(aString.Length(), fallible)) {
    328    return NS_ERROR_OUT_OF_MEMORY;
    329  }
    330  memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool));
    331 
    332  bool inWord = false;
    333  uint32_t wordStart = 0, wordLimit = 0;
    334  uint32_t chLen;
    335  for (uint32_t i = 0; i < aString.Length(); i += chLen) {
    336    uint32_t ch = aString[i];
    337    chLen = 1;
    338 
    339    if (NS_IS_HIGH_SURROGATE(ch)) {
    340      if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i + 1])) {
    341        ch = SURROGATE_TO_UCS4(ch, aString[i + 1]);
    342        chLen = 2;
    343      } else {
    344        NS_WARNING("unpaired surrogate found during hyphenation");
    345      }
    346    }
    347 
    348    nsUGenCategory cat = unicode::GetGenCategory(ch);
    349    if (cat == nsUGenCategory::kLetter || cat == nsUGenCategory::kMark) {
    350      if (!inWord) {
    351        inWord = true;
    352        wordStart = i;
    353      }
    354      wordLimit = i + chLen;
    355      if (i + chLen < aString.Length()) {
    356        continue;
    357      }
    358    }
    359 
    360    if (inWord) {
    361      HyphenateWord(aString, wordStart, wordLimit, aHyphens);
    362      inWord = false;
    363    }
    364  }
    365 
    366  return NS_OK;
    367 }
    368 
    369 void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart,
    370                                 uint32_t aLimit, nsTArray<bool>& aHyphens) {
    371  // Convert word from aStart and aLimit in aString to utf-8 for mapped_hyph,
    372  // lowercasing it as we go so that it will match the (lowercased) patterns
    373  // (bug 1105644).
    374  nsAutoCString utf8;
    375  const char16_t* cur = aString.BeginReading() + aStart;
    376  const char16_t* end = aString.BeginReading() + aLimit;
    377  bool firstLetter = true;
    378  while (cur < end) {
    379    uint32_t ch = *cur++;
    380 
    381    if (NS_IS_HIGH_SURROGATE(ch)) {
    382      if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {
    383        ch = SURROGATE_TO_UCS4(ch, *cur++);
    384      } else {
    385        return;  // unpaired surrogate: bail out, don't hyphenate broken text
    386      }
    387    } else if (NS_IS_LOW_SURROGATE(ch)) {
    388      return;  // unpaired surrogate
    389    }
    390 
    391    // XXX What about language-specific casing? Consider Turkish I/i...
    392    // In practice, it looks like the current patterns will not be
    393    // affected by this, as they treat dotted and undotted i similarly.
    394    uint32_t origCh = ch;
    395    ch = ToLowerCase(ch);
    396 
    397    if (ch != origCh) {
    398      // Avoid hyphenating capitalized words (bug 1550532) unless explicitly
    399      // allowed by prefs for the language in use.
    400      // Also never auto-hyphenate a word that has internal caps, as it may
    401      // well be an all-caps acronym or a quirky name like iTunes.
    402      if (!mHyphenateCapitalized || !firstLetter) {
    403        return;
    404      }
    405    }
    406    firstLetter = false;
    407 
    408    if (ch < 0x80) {  // U+0000 - U+007F
    409      utf8.Append(ch);
    410    } else if (ch < 0x0800) {  // U+0100 - U+07FF
    411      utf8.Append(0xC0 | (ch >> 6));
    412      utf8.Append(0x80 | (0x003F & ch));
    413    } else if (ch < 0x10000) {  // U+0800 - U+D7FF,U+E000 - U+FFFF
    414      utf8.Append(0xE0 | (ch >> 12));
    415      utf8.Append(0x80 | (0x003F & (ch >> 6)));
    416      utf8.Append(0x80 | (0x003F & ch));
    417    } else {
    418      utf8.Append(0xF0 | (ch >> 18));
    419      utf8.Append(0x80 | (0x003F & (ch >> 12)));
    420      utf8.Append(0x80 | (0x003F & (ch >> 6)));
    421      utf8.Append(0x80 | (0x003F & ch));
    422    }
    423  }
    424 
    425  AutoTArray<uint8_t, 200> hyphenValues;
    426  hyphenValues.SetLength(utf8.Length());
    427  int32_t result = mDict.match(
    428      [&](Span<const uint8_t>& span) {
    429        return mapped_hyph_find_hyphen_values_raw(
    430            span.data(), span.size(), utf8.BeginReading(), utf8.Length(),
    431            hyphenValues.Elements(), hyphenValues.Length());
    432      },
    433      [&](ipc::ReadOnlySharedMemoryHandle& shm) {
    434        // Only the parent process can have a handle stored. We should never
    435        // get to this point with just a handle.
    436        MOZ_ASSERT_UNREACHABLE("Unexpected HyphenateWord with only a handle");
    437        return 0;
    438      },
    439      [&](ipc::ReadOnlySharedMemoryMapping& shm) {
    440        return mapped_hyph_find_hyphen_values_raw(
    441            shm.DataAs<uint8_t>(), shm.Size(), utf8.BeginReading(),
    442            utf8.Length(), hyphenValues.Elements(), hyphenValues.Length());
    443      },
    444      [&](UniquePtr<const HyphDic>& hyph) {
    445        return mapped_hyph_find_hyphen_values_dic(
    446            hyph.get(), utf8.BeginReading(), utf8.Length(),
    447            hyphenValues.Elements(), hyphenValues.Length());
    448      });
    449  if (result > 0) {
    450    // We need to convert UTF-8 indexing as used by the hyphenation lib into
    451    // UTF-16 indexing of the aHyphens[] array for Gecko.
    452    uint32_t utf16index = 0;
    453    for (uint32_t utf8index = 0; utf8index < utf8.Length();) {
    454      // We know utf8 is valid, so we only need to look at the first byte of
    455      // each character to determine its length and the corresponding UTF-16
    456      // length to add to utf16index.
    457      const uint8_t leadByte = utf8[utf8index];
    458      if (leadByte < 0x80) {
    459        utf8index += 1;
    460      } else if (leadByte < 0xE0) {
    461        utf8index += 2;
    462      } else if (leadByte < 0xF0) {
    463        utf8index += 3;
    464      } else {
    465        utf8index += 4;
    466      }
    467      // The hyphenation value of interest is the one for the last code unit
    468      // of the utf-8 character, and is recorded on the last code unit of the
    469      // utf-16 character (in the case of a surrogate pair).
    470      utf16index += leadByte >= 0xF0 ? 2 : 1;
    471      if (utf16index > 0 && (hyphenValues[utf8index - 1] & 0x01)) {
    472        aHyphens[aStart + utf16index - 1] = true;
    473      }
    474    }
    475  }
    476 }
    477 
    478 ipc::ReadOnlySharedMemoryHandle nsHyphenator::CloneHandle() {
    479  MOZ_ASSERT(XRE_IsParentProcess());
    480 
    481  if (mDict.is<ipc::ReadOnlySharedMemoryHandle>()) {
    482    return mDict.as<ipc::ReadOnlySharedMemoryHandle>().Clone();
    483  }
    484  return nullptr;
    485 }