// nsHyphenator.cpp (16742B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 #include "nsHyphenator.h" 7 8 #include "mozilla/dom/ContentChild.h" 9 #include "mozilla/Omnijar.h" 10 #include "mozilla/ipc/SharedMemoryHandle.h" 11 #include "mozilla/ipc/SharedMemoryMapping.h" 12 #include "nsContentUtils.h" 13 #include "nsEscape.h" 14 #include "nsIChannel.h" 15 #include "nsIFile.h" 16 #include "nsIFileURL.h" 17 #include "nsIInputStream.h" 18 #include "nsIJARURI.h" 19 #include "nsIURI.h" 20 #include "nsNetUtil.h" 21 #include "nsUnicodeProperties.h" 22 #include "nsUTF8Utils.h" 23 #include "nsZipArchive.h" 24 25 #include "mapped_hyph.h" 26 27 using namespace mozilla; 28 29 namespace std { 30 void default_delete<const HyphDic>::operator()(const HyphDic* aHyph) const { 31 mapped_hyph_free_dictionary(const_cast<HyphDic*>(aHyph)); 32 } 33 34 void default_delete<const CompiledData>::operator()( 35 const CompiledData* aData) const { 36 mapped_hyph_free_compiled_data(const_cast<CompiledData*>(aData)); 37 } 38 } // namespace std 39 40 static const uint8_t* GetItemPtrFromJarURI(nsIJARURI* aJAR, uint32_t* aLength) { 41 // Try to get the jarfile's nsZipArchive, find the relevant item, and return 42 // a pointer to its data provided it is stored uncompressed. 
43 nsCOMPtr<nsIURI> jarFile; 44 if (NS_FAILED(aJAR->GetJARFile(getter_AddRefs(jarFile)))) { 45 return nullptr; 46 } 47 nsCOMPtr<nsIFileURL> fileUrl = do_QueryInterface(jarFile); 48 if (!fileUrl) { 49 return nullptr; 50 } 51 nsCOMPtr<nsIFile> file; 52 fileUrl->GetFile(getter_AddRefs(file)); 53 if (!file) { 54 return nullptr; 55 } 56 RefPtr<nsZipArchive> archive = Omnijar::GetReader(file); 57 if (archive) { 58 nsCString path; 59 aJAR->GetJAREntry(path); 60 nsZipItem* item = archive->GetItem(path); 61 if (item && item->Compression() == 0 && item->Size() > 0) { 62 // We do NOT own this data, but it won't go away until the omnijar 63 // file is closed during shutdown. 64 const uint8_t* data = archive->GetData(item); 65 if (data) { 66 *aLength = item->Size(); 67 return data; 68 } 69 } 70 } 71 return nullptr; 72 } 73 74 static ipc::ReadOnlySharedMemoryMapping GetHyphDictFromParent(nsIURI* aURI) { 75 MOZ_ASSERT(!XRE_IsParentProcess()); 76 ipc::ReadOnlySharedMemoryHandle handle; 77 MOZ_ASSERT(aURI); 78 if (!dom::ContentChild::GetSingleton()->SendGetHyphDict(aURI, &handle)) { 79 return nullptr; 80 } 81 if (!handle.IsValid()) { 82 return nullptr; 83 } 84 auto map = handle.Map(); 85 if (!map) { 86 return nullptr; 87 } 88 if (!map.Address()) { 89 return nullptr; 90 } 91 return map; 92 } 93 94 static ipc::ReadOnlySharedMemoryHandle CopyToShmem(const CompiledData* aData) { 95 MOZ_ASSERT(XRE_IsParentProcess()); 96 97 // The shm-related calls here are not expected to fail, but if they do, 98 // we'll just return null (as if the resource was unavailable) and proceed 99 // without hyphenation. 
100 uint32_t size = mapped_hyph_compiled_data_size(aData); 101 auto handle = ipc::shared_memory::CreateFreezable(size); 102 if (!handle) { 103 return nullptr; 104 } 105 auto map = std::move(handle).Map(); 106 if (!map) { 107 return nullptr; 108 } 109 char* buffer = map.DataAs<char>(); 110 if (!buffer) { 111 return nullptr; 112 } 113 114 memcpy(buffer, mapped_hyph_compiled_data_ptr(aData), size); 115 return std::move(map).Freeze(); 116 } 117 118 static ipc::ReadOnlySharedMemoryHandle LoadFromURI(nsIURI* aURI, 119 bool aPrecompiled) { 120 MOZ_ASSERT(XRE_IsParentProcess()); 121 nsCOMPtr<nsIChannel> channel; 122 if (NS_FAILED(NS_NewChannel( 123 getter_AddRefs(channel), aURI, nsContentUtils::GetSystemPrincipal(), 124 nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_SEC_CONTEXT_IS_NULL, 125 nsIContentPolicy::TYPE_OTHER))) { 126 return nullptr; 127 } 128 nsCOMPtr<nsIInputStream> instream; 129 if (NS_FAILED(channel->Open(getter_AddRefs(instream)))) { 130 return nullptr; 131 } 132 // Check size, bail out if it is excessively large (the largest of the 133 // hyphenation files currently shipped with Firefox is around 1MB 134 // uncompressed). 
135 uint64_t available; 136 if (NS_FAILED(instream->Available(&available)) || !available || 137 available > 16 * 1024 * 1024) { 138 return nullptr; 139 } 140 141 if (aPrecompiled) { 142 auto handle = ipc::shared_memory::CreateFreezable(available); 143 if (!handle) { 144 return nullptr; 145 } 146 auto map = std::move(handle).Map(); 147 if (!map) { 148 return nullptr; 149 } 150 char* buffer = map.DataAs<char>(); 151 if (!buffer) { 152 return nullptr; 153 } 154 155 uint32_t bytesRead = 0; 156 if (NS_FAILED(instream->Read(buffer, available, &bytesRead)) || 157 bytesRead != available) { 158 return nullptr; 159 } 160 161 if (!mapped_hyph_is_valid_hyphenator( 162 reinterpret_cast<const uint8_t*>(buffer), bytesRead)) { 163 return nullptr; 164 } 165 166 return std::move(map).Freeze(); 167 } 168 169 // Read from the URI into a temporary buffer, compile it, then copy the 170 // compiled resource to a shared memory region. 171 auto buffer = MakeUnique<char[]>(available); 172 uint32_t bytesRead = 0; 173 if (NS_FAILED(instream->Read(buffer.get(), available, &bytesRead)) || 174 bytesRead != available) { 175 return nullptr; 176 } 177 178 UniquePtr<const CompiledData> data(mapped_hyph_compile_buffer( 179 reinterpret_cast<const uint8_t*>(buffer.get()), bytesRead, false)); 180 if (data) { 181 return CopyToShmem(data.get()); 182 } 183 184 return nullptr; 185 } 186 187 nsHyphenator::nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized) 188 : mDict(Span<const uint8_t>()), 189 mHyphenateCapitalized(aHyphenateCapitalized) { 190 // Files with extension ".hyf" are expected to be precompiled mapped_hyph 191 // tables; we also support uncompiled ".dic" files, but they are more 192 // expensive to process on first load. 
193 nsAutoCString path; 194 aURI->GetFilePath(path); 195 bool precompiled = StringEndsWith(path, ".hyf"_ns); 196 197 // Content processes don't do compilation; they depend on the parent giving 198 // them a compiled version of the resource, so that we only pay the cost of 199 // compilation once per language per session. 200 if (!precompiled && !XRE_IsParentProcess()) { 201 auto shm = GetHyphDictFromParent(aURI); 202 if (shm) { 203 // We don't need to validate mDict because the parent process 204 // will have done so. 205 mDict.emplace<ipc::ReadOnlySharedMemoryMapping>(std::move(shm)); 206 } 207 return; 208 } 209 210 nsCOMPtr<nsIJARURI> jar = do_QueryInterface(aURI); 211 if (jar) { 212 // This gives us a raw pointer into the omnijar's data (if uncompressed); 213 // we do not own it and must not attempt to free it! 214 uint32_t length; 215 const uint8_t* ptr = GetItemPtrFromJarURI(jar, &length); 216 if (ptr) { 217 if (precompiled) { 218 // The data should be directly usable by mapped_hyph; validate that it 219 // looks correct, and save the pointer. 220 if (mapped_hyph_is_valid_hyphenator(ptr, length)) { 221 mDict.emplace<Span<const uint8_t>>(ptr, length); 222 return; 223 } 224 } else { 225 // The data is an uncompiled pattern file, so we need to compile it. 226 // We then move it to shared memory so we can expose it to content 227 // processes. 228 MOZ_ASSERT(XRE_IsParentProcess()); 229 UniquePtr<const CompiledData> data( 230 mapped_hyph_compile_buffer(ptr, length, false)); 231 if (data) { 232 auto shm = CopyToShmem(data.get()); 233 if (shm) { 234 mDict.emplace<ipc::ReadOnlySharedMemoryHandle>(std::move(shm)); 235 return; 236 } 237 } 238 } 239 } else { 240 // Omnijar must be compressed (currently this is the case on Android). 241 // If we're the parent process, decompress the resource into a shmem 242 // buffer; if we're a child, send a request to the parent for the 243 // shared-memory copy (which it will load if not already available). 
244 if (XRE_IsParentProcess()) { 245 auto shm = LoadFromURI(aURI, precompiled); 246 if (shm) { 247 mDict.emplace<ipc::ReadOnlySharedMemoryHandle>(std::move(shm)); 248 return; 249 } 250 } else { 251 auto shm = GetHyphDictFromParent(aURI); 252 if (shm) { 253 // We don't need to validate mDict because the parent process 254 // will have done so. 255 mDict.emplace<ipc::ReadOnlySharedMemoryMapping>(std::move(shm)); 256 return; 257 } 258 } 259 } 260 } 261 262 // We get file:// URIs when running an unpackaged build; they could also 263 // occur if we support adding hyphenation dictionaries by putting files in 264 // a directory of the profile, for example. 265 if (aURI->SchemeIs("file")) { 266 // Ask the Rust lib to mmap the file. In this case our mDictSize field 267 // remains zero; mDict is not a pointer to the raw data but an opaque 268 // reference to a Rust object, and can only be freed by passing it to 269 // mapped_hyph_free_dictionary(). 270 // (This case occurs in unpackaged developer builds.) 271 #if XP_WIN 272 // GetFilePath returns the path with an unexpected leading slash (like 273 // "/c:/path/to/firefox/...") that may prevent it being found if it's an 274 // absolute Windows path starting with a drive letter. 275 // So check for this case and strip the slash. 276 if (path.Length() > 2 && path[0] == '/' && path[2] == ':') { 277 path.Cut(0, 1); 278 } 279 #endif 280 // In case of %-escaped spaces or other "special" chars in the path, 281 // we need the unescaped version to pass to mapped_hyph_load_dictionary. 282 NS_UnescapeURL(path); 283 if (precompiled) { 284 // If the file is compiled, we can just map it directly. 285 UniquePtr<const HyphDic> dic(mapped_hyph_load_dictionary(path.get())); 286 if (dic) { 287 mDict = AsVariant(std::move(dic)); 288 return; 289 } 290 } else { 291 // For an uncompiled .dic file, the parent process is responsible for 292 // compiling it and storing the result in a shmem block that can be 293 // shared to content processes. 
294 MOZ_ASSERT(XRE_IsParentProcess()); 295 MOZ_ASSERT(StringEndsWith(path, ".dic"_ns)); 296 UniquePtr<const CompiledData> data( 297 mapped_hyph_compile_file(path.get(), false)); 298 if (data) { 299 auto shm = CopyToShmem(data.get()); 300 if (shm) { 301 mDict.emplace<ipc::ReadOnlySharedMemoryHandle>(std::move(shm)); 302 return; 303 } 304 } 305 } 306 } 307 308 // Each loading branch above will return if successful. So if we get here, 309 // whichever load type we attempted must have failed because something about 310 // the resource is broken. 311 nsAutoCString msg; 312 aURI->GetSpec(msg); 313 msg.Insert("Invalid hyphenation resource: ", 0); 314 NS_ASSERTION(false, msg.get()); 315 } 316 317 bool nsHyphenator::IsValid() { 318 return mDict.match( 319 [](Span<const uint8_t>& span) { return !span.IsEmpty(); }, 320 [](ipc::ReadOnlySharedMemoryHandle& shm) { return shm.IsValid(); }, 321 [](ipc::ReadOnlySharedMemoryMapping& shm) { return shm.IsValid(); }, 322 [](UniquePtr<const HyphDic>& hyph) { return hyph != nullptr; }); 323 } 324 325 nsresult nsHyphenator::Hyphenate(const nsAString& aString, 326 nsTArray<bool>& aHyphens) { 327 if (!aHyphens.SetLength(aString.Length(), fallible)) { 328 return NS_ERROR_OUT_OF_MEMORY; 329 } 330 memset(aHyphens.Elements(), false, aHyphens.Length() * sizeof(bool)); 331 332 bool inWord = false; 333 uint32_t wordStart = 0, wordLimit = 0; 334 uint32_t chLen; 335 for (uint32_t i = 0; i < aString.Length(); i += chLen) { 336 uint32_t ch = aString[i]; 337 chLen = 1; 338 339 if (NS_IS_HIGH_SURROGATE(ch)) { 340 if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i + 1])) { 341 ch = SURROGATE_TO_UCS4(ch, aString[i + 1]); 342 chLen = 2; 343 } else { 344 NS_WARNING("unpaired surrogate found during hyphenation"); 345 } 346 } 347 348 nsUGenCategory cat = unicode::GetGenCategory(ch); 349 if (cat == nsUGenCategory::kLetter || cat == nsUGenCategory::kMark) { 350 if (!inWord) { 351 inWord = true; 352 wordStart = i; 353 } 354 wordLimit = i + chLen; 355 
if (i + chLen < aString.Length()) { 356 continue; 357 } 358 } 359 360 if (inWord) { 361 HyphenateWord(aString, wordStart, wordLimit, aHyphens); 362 inWord = false; 363 } 364 } 365 366 return NS_OK; 367 } 368 369 void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart, 370 uint32_t aLimit, nsTArray<bool>& aHyphens) { 371 // Convert word from aStart and aLimit in aString to utf-8 for mapped_hyph, 372 // lowercasing it as we go so that it will match the (lowercased) patterns 373 // (bug 1105644). 374 nsAutoCString utf8; 375 const char16_t* cur = aString.BeginReading() + aStart; 376 const char16_t* end = aString.BeginReading() + aLimit; 377 bool firstLetter = true; 378 while (cur < end) { 379 uint32_t ch = *cur++; 380 381 if (NS_IS_HIGH_SURROGATE(ch)) { 382 if (cur < end && NS_IS_LOW_SURROGATE(*cur)) { 383 ch = SURROGATE_TO_UCS4(ch, *cur++); 384 } else { 385 return; // unpaired surrogate: bail out, don't hyphenate broken text 386 } 387 } else if (NS_IS_LOW_SURROGATE(ch)) { 388 return; // unpaired surrogate 389 } 390 391 // XXX What about language-specific casing? Consider Turkish I/i... 392 // In practice, it looks like the current patterns will not be 393 // affected by this, as they treat dotted and undotted i similarly. 394 uint32_t origCh = ch; 395 ch = ToLowerCase(ch); 396 397 if (ch != origCh) { 398 // Avoid hyphenating capitalized words (bug 1550532) unless explicitly 399 // allowed by prefs for the language in use. 400 // Also never auto-hyphenate a word that has internal caps, as it may 401 // well be an all-caps acronym or a quirky name like iTunes. 
402 if (!mHyphenateCapitalized || !firstLetter) { 403 return; 404 } 405 } 406 firstLetter = false; 407 408 if (ch < 0x80) { // U+0000 - U+007F 409 utf8.Append(ch); 410 } else if (ch < 0x0800) { // U+0100 - U+07FF 411 utf8.Append(0xC0 | (ch >> 6)); 412 utf8.Append(0x80 | (0x003F & ch)); 413 } else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF 414 utf8.Append(0xE0 | (ch >> 12)); 415 utf8.Append(0x80 | (0x003F & (ch >> 6))); 416 utf8.Append(0x80 | (0x003F & ch)); 417 } else { 418 utf8.Append(0xF0 | (ch >> 18)); 419 utf8.Append(0x80 | (0x003F & (ch >> 12))); 420 utf8.Append(0x80 | (0x003F & (ch >> 6))); 421 utf8.Append(0x80 | (0x003F & ch)); 422 } 423 } 424 425 AutoTArray<uint8_t, 200> hyphenValues; 426 hyphenValues.SetLength(utf8.Length()); 427 int32_t result = mDict.match( 428 [&](Span<const uint8_t>& span) { 429 return mapped_hyph_find_hyphen_values_raw( 430 span.data(), span.size(), utf8.BeginReading(), utf8.Length(), 431 hyphenValues.Elements(), hyphenValues.Length()); 432 }, 433 [&](ipc::ReadOnlySharedMemoryHandle& shm) { 434 // Only the parent process can have a handle stored. We should never 435 // get to this point with just a handle. 436 MOZ_ASSERT_UNREACHABLE("Unexpected HyphenateWord with only a handle"); 437 return 0; 438 }, 439 [&](ipc::ReadOnlySharedMemoryMapping& shm) { 440 return mapped_hyph_find_hyphen_values_raw( 441 shm.DataAs<uint8_t>(), shm.Size(), utf8.BeginReading(), 442 utf8.Length(), hyphenValues.Elements(), hyphenValues.Length()); 443 }, 444 [&](UniquePtr<const HyphDic>& hyph) { 445 return mapped_hyph_find_hyphen_values_dic( 446 hyph.get(), utf8.BeginReading(), utf8.Length(), 447 hyphenValues.Elements(), hyphenValues.Length()); 448 }); 449 if (result > 0) { 450 // We need to convert UTF-8 indexing as used by the hyphenation lib into 451 // UTF-16 indexing of the aHyphens[] array for Gecko. 
452 uint32_t utf16index = 0; 453 for (uint32_t utf8index = 0; utf8index < utf8.Length();) { 454 // We know utf8 is valid, so we only need to look at the first byte of 455 // each character to determine its length and the corresponding UTF-16 456 // length to add to utf16index. 457 const uint8_t leadByte = utf8[utf8index]; 458 if (leadByte < 0x80) { 459 utf8index += 1; 460 } else if (leadByte < 0xE0) { 461 utf8index += 2; 462 } else if (leadByte < 0xF0) { 463 utf8index += 3; 464 } else { 465 utf8index += 4; 466 } 467 // The hyphenation value of interest is the one for the last code unit 468 // of the utf-8 character, and is recorded on the last code unit of the 469 // utf-16 character (in the case of a surrogate pair). 470 utf16index += leadByte >= 0xF0 ? 2 : 1; 471 if (utf16index > 0 && (hyphenValues[utf8index - 1] & 0x01)) { 472 aHyphens[aStart + utf16index - 1] = true; 473 } 474 } 475 } 476 } 477 478 ipc::ReadOnlySharedMemoryHandle nsHyphenator::CloneHandle() { 479 MOZ_ASSERT(XRE_IsParentProcess()); 480 481 if (mDict.is<ipc::ReadOnlySharedMemoryHandle>()) { 482 return mDict.as<ipc::ReadOnlySharedMemoryHandle>().Clone(); 483 } 484 return nullptr; 485 }