CharacterEncoding.cpp (29728B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "js/CharacterEncoding.h" 8 9 #include "mozilla/CheckedInt.h" 10 #include "mozilla/DebugOnly.h" 11 #include "mozilla/Latin1.h" 12 #include "mozilla/Maybe.h" 13 #include "mozilla/Range.h" 14 #include "mozilla/Span.h" 15 #include "mozilla/Sprintf.h" 16 #include "mozilla/TextUtils.h" 17 #include "mozilla/Utf8.h" 18 19 #ifndef XP_LINUX 20 // We still support libstd++ versions without codecvt support on Linux. 21 // 22 // When the minimum supported libstd++ version is bumped to 3.4.21, we can 23 // enable the codecvt code path for Linux, too. This should happen in 2024 when 24 // support for CentOS 7 is removed. 25 # include <codecvt> 26 #endif 27 #include <cwchar> 28 #include <limits> 29 #include <locale> 30 #include <type_traits> 31 32 #include "frontend/FrontendContext.h" 33 #include "js/friend/ErrorMessages.h" // js::GetErrorMessage, JSMSG_* 34 #include "util/StringBuilder.h" 35 #include "util/Unicode.h" // unicode::REPLACEMENT_CHARACTER 36 #include "vm/JSContext.h" 37 38 using mozilla::AsChars; 39 using mozilla::AsciiValidUpTo; 40 using mozilla::AsWritableChars; 41 using mozilla::ConvertLatin1toUtf8Partial; 42 using mozilla::ConvertUtf16toUtf8Partial; 43 using mozilla::IsAscii; 44 using mozilla::IsUtf8Latin1; 45 using mozilla::LossyConvertUtf16toLatin1; 46 using mozilla::Span; 47 using mozilla::Utf8Unit; 48 49 using JS::Latin1CharsZ; 50 using JS::TwoByteCharsZ; 51 using JS::UTF8Chars; 52 using JS::UTF8CharsZ; 53 54 using namespace js; 55 using namespace js::unicode; 56 57 Latin1CharsZ JS::LossyTwoByteCharsToNewLatin1CharsZ( 58 JSContext* cx, const mozilla::Range<const char16_t>& tbchars) { 59 MOZ_ASSERT(cx); 60 size_t len = tbchars.length(); 61 unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1); 62 if (!latin1) { 63 return Latin1CharsZ(); 64 } 65 LossyConvertUtf16toLatin1(tbchars, AsWritableChars(Span(latin1, len))); 66 latin1[len] = '\0'; 67 return Latin1CharsZ(latin1, len); 68 } 69 70 template <typename CharT> 71 static size_t GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars) { 72 size_t nbytes = nchars; 73 for (const CharT* end = chars + nchars; chars < end; chars++) { 74 char16_t c = *chars; 75 if (c < 0x80) { 76 continue; 77 } 78 char32_t v; 79 if (IsSurrogate(c)) { 80 /* nbytes sets 1 length since this is surrogate pair. */ 81 if (IsTrailSurrogate(c) || (chars + 1) == end) { 82 nbytes += 2; /* Bad Surrogate */ 83 continue; 84 } 85 char16_t c2 = chars[1]; 86 if (!IsTrailSurrogate(c2)) { 87 nbytes += 2; /* Bad Surrogate */ 88 continue; 89 } 90 v = UTF16Decode(c, c2); 91 nbytes--; 92 chars++; 93 } else { 94 v = c; 95 } 96 v >>= 11; 97 nbytes++; 98 while (v) { 99 v >>= 5; 100 nbytes++; 101 } 102 } 103 return nbytes; 104 } 105 106 JS_PUBLIC_API size_t JS::GetDeflatedUTF8StringLength(JSLinearString* s) { 107 JS::AutoCheckCannotGC nogc; 108 return s->hasLatin1Chars() 109 ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length()) 110 : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc), 111 s->length()); 112 } 113 114 JS_PUBLIC_API size_t JS::DeflateStringToUTF8Buffer(JSLinearString* src, 115 mozilla::Span<char> dst) { 116 JS::AutoCheckCannotGC nogc; 117 if (src->hasLatin1Chars()) { 118 auto source = AsChars(Span(src->latin1Chars(nogc), src->length())); 119 auto [read, written] = ConvertLatin1toUtf8Partial(source, dst); 120 (void)read; 121 return written; 122 } 123 auto source = Span(src->twoByteChars(nogc), src->length()); 124 auto [read, written] = ConvertUtf16toUtf8Partial(source, dst); 125 (void)read; 126 return written; 127 } 128 129 template <typename CharT> 130 void ConvertToUTF8(mozilla::Span<CharT> src, mozilla::Span<char> dst); 131 132 template <> 133 void ConvertToUTF8<const char16_t>(mozilla::Span<const char16_t> src, 134 mozilla::Span<char> dst) { 135 (void)ConvertUtf16toUtf8Partial(src, dst); 136 } 137 138 template <> 139 void ConvertToUTF8<const Latin1Char>(mozilla::Span<const Latin1Char> src, 140 mozilla::Span<char> dst) { 141 (void)ConvertLatin1toUtf8Partial(AsChars(src), dst); 142 } 143 144 template <typename CharT, typename Allocator> 145 UTF8CharsZ JS::CharsToNewUTF8CharsZ(Allocator* alloc, 146 const mozilla::Range<CharT>& chars) { 147 /* Get required buffer size. */ 148 const CharT* str = chars.begin().get(); 149 size_t len = ::GetDeflatedUTF8StringLength(str, chars.length()); 150 151 /* Allocate buffer. */ 152 char* utf8 = alloc->template pod_malloc<char>(len + 1); 153 if (!utf8) { 154 return UTF8CharsZ(); 155 } 156 157 /* Encode to UTF8. */ 158 ::ConvertToUTF8(Span(str, chars.length()), Span(utf8, len)); 159 utf8[len] = '\0'; 160 161 return UTF8CharsZ(utf8, len); 162 } 163 164 template UTF8CharsZ JS::CharsToNewUTF8CharsZ( 165 JSContext* cx, const mozilla::Range<Latin1Char>& chars); 166 167 template UTF8CharsZ JS::CharsToNewUTF8CharsZ( 168 JSContext* cx, const mozilla::Range<char16_t>& chars); 169 170 template UTF8CharsZ JS::CharsToNewUTF8CharsZ( 171 JSContext* cx, const mozilla::Range<const Latin1Char>& chars); 172 173 template UTF8CharsZ JS::CharsToNewUTF8CharsZ( 174 JSContext* cx, const mozilla::Range<const char16_t>& chars); 175 176 template UTF8CharsZ JS::CharsToNewUTF8CharsZ( 177 FrontendAllocator* cx, const mozilla::Range<Latin1Char>& chars); 178 179 template UTF8CharsZ JS::CharsToNewUTF8CharsZ( 180 FrontendAllocator* cx, const mozilla::Range<char16_t>& chars); 181 182 template UTF8CharsZ JS::CharsToNewUTF8CharsZ( 183 FrontendAllocator* cx, const mozilla::Range<const Latin1Char>& chars); 184 185 template UTF8CharsZ JS::CharsToNewUTF8CharsZ( 186 FrontendAllocator* cx, const mozilla::Range<const char16_t>& chars); 187 188 static constexpr uint32_t INVALID_UTF8 = std::numeric_limits<char32_t>::max(); 189 190 /* 191 * Convert a UTF-8 character sequence into a UCS-4 character and return that 192 * character. It is assumed that the caller already checked that the sequence 193 * is valid. 194 */ 195 static char32_t Utf8ToOneUcs4CharImpl(const uint8_t* utf8Buffer, 196 int utf8Length) { 197 MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4); 198 199 if (utf8Length == 1) { 200 MOZ_ASSERT(!(*utf8Buffer & 0x80)); 201 return *utf8Buffer; 202 } 203 204 /* from Unicode 3.1, non-shortest form is illegal */ 205 static const char32_t minucs4Table[] = {0x80, 0x800, NonBMPMin}; 206 207 MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) == 208 (0x100 - (1 << (8 - utf8Length)))); 209 char32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1); 210 char32_t minucs4Char = minucs4Table[utf8Length - 2]; 211 while (--utf8Length) { 212 MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80); 213 ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F); 214 } 215 216 if (MOZ_UNLIKELY(ucs4Char < minucs4Char)) { 217 return INVALID_UTF8; 218 } 219 220 if (MOZ_UNLIKELY(IsSurrogate(ucs4Char))) { 221 return INVALID_UTF8; 222 } 223 224 return ucs4Char; 225 } 226 227 char32_t JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length) { 228 return Utf8ToOneUcs4CharImpl(utf8Buffer, utf8Length); 229 } 230 231 static void ReportInvalidCharacter(JSContext* cx, uint32_t offset) { 232 char buffer[10]; 233 SprintfLiteral(buffer, "%u", offset); 234 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, 235 JSMSG_MALFORMED_UTF8_CHAR, buffer); 236 } 237 238 static void ReportBufferTooSmall(JSContext* cx, uint32_t dummy) { 239 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, 240 JSMSG_BUFFER_TOO_SMALL); 241 } 242 243 static void ReportTooBigCharacter(JSContext* cx, uint32_t v) { 244 char buffer[11]; 245 SprintfLiteral(buffer, "0x%x", v); 246 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, 247 JSMSG_UTF8_CHAR_TOO_LARGE, buffer); 248 } 249 250 enum class LoopDisposition { 251 Break, 252 Continue, 253 }; 254 255 enum class OnUTF8Error { 256 InsertReplacementCharacter, 257 InsertQuestionMark, 258 Throw, 259 Crash, 260 }; 261 262 inline bool IsInvalidSecondByte(uint32_t first, uint8_t second) { 263 // Perform an extra check aginst the second byte. 264 // From Unicode Standard v6.2, Table 3-7 Well-Formed UTF-8 Byte Sequences. 265 // 266 // The consumer should perform a followup check for second & 0xC0 == 0x80. 267 return (first == 0xE0 && (second & 0xE0) != 0xA0) || // E0 A0~BF 268 (first == 0xED && (second & 0xE0) != 0x80) || // ED 80~9F 269 (first == 0xF0 && (second & 0xF0) == 0x80) || // F0 90~BF 270 (first == 0xF4 && (second & 0xF0) != 0x80); // F4 80~8F 271 } 272 273 // Scan UTF-8 input and (internally, at least) convert it to a series of UTF-16 274 // code units. But you can also do odd things like pass an empty lambda for 275 // `dst`, in which case the output is discarded entirely--the only effect of 276 // calling the template that way is error-checking. 277 template <OnUTF8Error ErrorAction, typename OutputFn> 278 static bool InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars& src, 279 OutputFn dst) { 280 size_t srclen = src.length(); 281 for (uint32_t i = 0; i < srclen; i++) { 282 uint32_t v = uint32_t(src[i]); 283 if (!(v & 0x80)) { 284 // ASCII code unit. Simple copy. 285 if (dst(uint16_t(v)) == LoopDisposition::Break) { 286 break; 287 } 288 } else { 289 #define INVALID(report, arg, n2) \ 290 do { \ 291 if (ErrorAction == OnUTF8Error::Throw) { \ 292 report(cx, arg); \ 293 return false; \ 294 } else if (ErrorAction == OnUTF8Error::Crash) { \ 295 MOZ_CRASH("invalid UTF-8 string: " #report); \ 296 } else { \ 297 char16_t replacement; \ 298 if (ErrorAction == OnUTF8Error::InsertReplacementCharacter) { \ 299 replacement = REPLACEMENT_CHARACTER; \ 300 } else { \ 301 MOZ_ASSERT(ErrorAction == OnUTF8Error::InsertQuestionMark); \ 302 replacement = '?'; \ 303 } \ 304 if (dst(replacement) == LoopDisposition::Break) { \ 305 break; \ 306 } \ 307 n = n2; \ 308 goto invalidMultiByteCodeUnit; \ 309 } \ 310 } while (0) 311 312 // Non-ASCII code unit. Determine its length in bytes (n). 313 // 314 // Avoid undefined behavior from passing in 0 315 // (https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html#index-_005f_005fbuiltin_005fclz) 316 // by turning on the low bit so that 0xff will set n=31-24=7, which will 317 // be detected as an invalid character. 318 uint32_t n = mozilla::CountLeadingZeroes32(~int8_t(src[i]) | 0x1) - 24; 319 320 // Check the leading byte. 321 if (n < 2 || n > 4) { 322 INVALID(ReportInvalidCharacter, i, 1); 323 } 324 325 // Check that |src| is large enough to hold an n-byte code unit. 326 if (i + n > srclen) { 327 // Check the second and continuation bytes, to replace maximal subparts 328 // of an ill-formed subsequence with single U+FFFD. 329 if (i + 2 > srclen) { 330 INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1); 331 } 332 333 if (IsInvalidSecondByte(v, (uint8_t)src[i + 1])) { 334 INVALID(ReportInvalidCharacter, i, 1); 335 } 336 337 if ((src[i + 1] & 0xC0) != 0x80) { 338 INVALID(ReportInvalidCharacter, i, 1); 339 } 340 341 if (n == 3) { 342 INVALID(ReportInvalidCharacter, i, 2); 343 } else { 344 if (i + 3 > srclen) { 345 INVALID(ReportBufferTooSmall, /* dummy = */ 0, 2); 346 } 347 if ((src[i + 2] & 0xC0) != 0x80) { 348 INVALID(ReportInvalidCharacter, i, 2); 349 } 350 INVALID(ReportInvalidCharacter, i, 3); 351 } 352 } 353 354 if (IsInvalidSecondByte(v, (uint8_t)src[i + 1])) { 355 INVALID(ReportInvalidCharacter, i, 1); 356 } 357 358 // Check the continuation bytes. 359 for (uint32_t m = 1; m < n; m++) { 360 if ((src[i + m] & 0xC0) != 0x80) { 361 INVALID(ReportInvalidCharacter, i, m); 362 } 363 } 364 365 // Determine the code unit's length in CharT and act accordingly. 366 v = Utf8ToOneUcs4CharImpl((uint8_t*)&src[i], n); 367 if (v < NonBMPMin) { 368 // The n-byte UTF8 code unit will fit in a single CharT. 369 if (dst(char16_t(v)) == LoopDisposition::Break) { 370 break; 371 } 372 } else if (v <= NonBMPMax) { 373 // The n-byte UTF8 code unit will fit in two CharT units. 374 if (dst(LeadSurrogate(v)) == LoopDisposition::Break) { 375 break; 376 } 377 if (dst(TrailSurrogate(v)) == LoopDisposition::Break) { 378 break; 379 } 380 } else { 381 // The n-byte UTF8 code unit won't fit in two CharT units. 382 INVALID(ReportTooBigCharacter, v, 1); 383 } 384 385 invalidMultiByteCodeUnit: 386 // Move i to the last byte of the multi-byte code unit; the loop 387 // header will do the final i++ to move to the start of the next 388 // code unit. 389 i += n - 1; 390 } 391 } 392 393 return true; 394 } 395 396 template <OnUTF8Error ErrorAction, typename CharT> 397 static void CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars& src, 398 CharT* dst, size_t outlen, 399 bool allASCII) { 400 if (allASCII) { 401 size_t srclen = src.length(); 402 MOZ_ASSERT(outlen == srclen); 403 for (uint32_t i = 0; i < srclen; i++) { 404 dst[i] = CharT(src[i]); 405 } 406 } else { 407 size_t j = 0; 408 auto push = [dst, &j](char16_t c) -> LoopDisposition { 409 dst[j++] = CharT(c); 410 return LoopDisposition::Continue; 411 }; 412 MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<ErrorAction>(cx, src, push))); 413 MOZ_ASSERT(j == outlen); 414 } 415 } 416 417 template <OnUTF8Error ErrorAction, typename CharsT> 418 static CharsT InflateUTF8StringHelper(JSContext* cx, const UTF8Chars& src, 419 size_t* outlen, arena_id_t destArenaId) { 420 using CharT = typename CharsT::CharT; 421 static_assert( 422 std::is_same_v<CharT, char16_t> || std::is_same_v<CharT, Latin1Char>, 423 "bad CharT"); 424 425 *outlen = 0; 426 427 size_t len = 0; 428 bool allASCII = true; 429 auto count = [&len, &allASCII](char16_t c) -> LoopDisposition { 430 len++; 431 allASCII &= (c < 0x80); 432 return LoopDisposition::Continue; 433 }; 434 if (!InflateUTF8ToUTF16<ErrorAction>(cx, src, count)) { 435 return CharsT(); 436 } 437 *outlen = len; 438 439 CharT* dst = cx->pod_arena_malloc<CharT>(destArenaId, 440 *outlen + 1); // +1 for NUL 441 442 if (!dst) { 443 ReportOutOfMemory(cx); 444 return CharsT(); 445 } 446 447 constexpr OnUTF8Error errorMode = 448 std::is_same_v<CharT, Latin1Char> 449 ? OnUTF8Error::InsertQuestionMark 450 : OnUTF8Error::InsertReplacementCharacter; 451 CopyAndInflateUTF8IntoBuffer<errorMode>(cx, src, dst, *outlen, allASCII); 452 dst[*outlen] = CharT('\0'); 453 454 return CharsT(dst, *outlen); 455 } 456 457 TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, 458 const UTF8Chars& utf8, 459 size_t* outlen, 460 arena_id_t destArenaId) { 461 return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>( 462 cx, utf8, outlen, destArenaId); 463 } 464 465 TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, 466 const JS::UTF8Chars& utf8, 467 size_t* outlen, 468 arena_id_t destArenaId) { 469 return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter, 470 TwoByteCharsZ>(cx, utf8, outlen, destArenaId); 471 } 472 473 static void UpdateSmallestEncodingForChar(char16_t c, 474 JS::SmallestEncoding* encoding) { 475 JS::SmallestEncoding newEncoding = JS::SmallestEncoding::ASCII; 476 if (c >= 0x80) { 477 if (c < 0x100) { 478 newEncoding = JS::SmallestEncoding::Latin1; 479 } else { 480 newEncoding = JS::SmallestEncoding::UTF16; 481 } 482 } 483 if (newEncoding > *encoding) { 484 *encoding = newEncoding; 485 } 486 } 487 488 JS::SmallestEncoding JS::FindSmallestEncoding(const UTF8Chars& utf8) { 489 Span<const unsigned char> unsignedSpan = utf8; 490 auto charSpan = AsChars(unsignedSpan); 491 size_t upTo = AsciiValidUpTo(charSpan); 492 if (upTo == charSpan.Length()) { 493 return SmallestEncoding::ASCII; 494 } 495 if (IsUtf8Latin1(charSpan.From(upTo))) { 496 return SmallestEncoding::Latin1; 497 } 498 return SmallestEncoding::UTF16; 499 } 500 501 Latin1CharsZ JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, 502 const UTF8Chars& utf8, 503 size_t* outlen, 504 arena_id_t destArenaId) { 505 return InflateUTF8StringHelper<OnUTF8Error::Throw, Latin1CharsZ>( 506 cx, utf8, outlen, destArenaId); 507 } 508 509 /** 510 * Atomization Helpers. 511 * 512 * These functions are extremely single-use, and are not intended for general 513 * consumption. 514 */ 515 516 bool GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars& utf8, 517 size_t* outlen, JS::SmallestEncoding* encoding, 518 HashNumber* hashNum) { 519 *outlen = 0; 520 *encoding = JS::SmallestEncoding::ASCII; 521 *hashNum = 0; 522 523 auto getMetadata = [outlen, encoding, 524 hashNum](char16_t c) -> LoopDisposition { 525 (*outlen)++; 526 UpdateSmallestEncodingForChar(c, encoding); 527 *hashNum = mozilla::AddToHash(*hashNum, c); 528 return LoopDisposition::Continue; 529 }; 530 if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, utf8, getMetadata)) { 531 return false; 532 } 533 534 return true; 535 } 536 537 template <typename CharT> 538 bool UTF8EqualsChars(const JS::UTF8Chars& utfChars, const CharT* chars) { 539 size_t ind = 0; 540 bool isEqual = true; 541 542 auto checkEqual = [&isEqual, &ind, chars](char16_t c) -> LoopDisposition { 543 #ifdef DEBUG 544 JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII; 545 UpdateSmallestEncodingForChar(c, &encoding); 546 if (std::is_same_v<CharT, JS::Latin1Char>) { 547 MOZ_ASSERT(encoding <= JS::SmallestEncoding::Latin1); 548 } else if (!std::is_same_v<CharT, char16_t>) { 549 MOZ_CRASH("Invalid character type in UTF8EqualsChars"); 550 } 551 #endif 552 553 if (CharT(c) != chars[ind]) { 554 isEqual = false; 555 return LoopDisposition::Break; 556 } 557 558 ind++; 559 return LoopDisposition::Continue; 560 }; 561 562 // To get here, you must have checked your work. 563 InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, utfChars, 564 checkEqual); 565 566 return isEqual; 567 } 568 569 template bool UTF8EqualsChars(const JS::UTF8Chars&, const char16_t*); 570 template bool UTF8EqualsChars(const JS::UTF8Chars&, const JS::Latin1Char*); 571 572 template <typename CharT> 573 void InflateUTF8CharsToBuffer(const JS::UTF8Chars& src, CharT* dst, 574 size_t dstLen, JS::SmallestEncoding encoding) { 575 CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>( 576 /* cx = */ nullptr, src, dst, dstLen, 577 encoding == JS::SmallestEncoding::ASCII); 578 } 579 580 template void InflateUTF8CharsToBuffer(const UTF8Chars& src, char16_t* dst, 581 size_t dstLen, 582 JS::SmallestEncoding encoding); 583 template void InflateUTF8CharsToBuffer(const UTF8Chars& src, 584 JS::Latin1Char* dst, size_t dstLen, 585 JS::SmallestEncoding encoding); 586 587 #ifdef DEBUG 588 void JS::ConstUTF8CharsZ::validate(size_t aLength) { 589 MOZ_ASSERT(data_); 590 UTF8Chars chars(data_, aLength); 591 auto nop = [](char16_t) -> LoopDisposition { 592 return LoopDisposition::Continue; 593 }; 594 InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, chars, nop); 595 } 596 void JS::ConstUTF8CharsZ::validateWithoutLength() { 597 MOZ_ASSERT(data_); 598 validate(strlen(data_)); 599 } 600 #endif 601 602 bool JS::StringIsASCII(const char* s) { 603 while (*s) { 604 if (*s & 0x80) { 605 return false; 606 } 607 s++; 608 } 609 return true; 610 } 611 612 bool JS::StringIsASCII(Span<const char> s) { return IsAscii(s); } 613 614 JS_PUBLIC_API JS::UniqueChars JS::EncodeNarrowToUtf8(JSContext* cx, 615 const char* chars) { 616 // Convert the narrow multibyte character string to a wide string and then 617 // use EncodeWideToUtf8() to convert the wide string to a UTF-8 string. 618 619 std::mbstate_t mb{}; 620 621 // NOTE: The 2nd parameter is overwritten even if the 1st parameter is nullptr 622 // on Android NDK older than v16. Use a temporary variable to save the 623 // `chars` for the subsequent call. See bug 1492090. 624 const char* tmpChars = chars; 625 626 size_t wideLen = std::mbsrtowcs(nullptr, &tmpChars, 0, &mb); 627 if (wideLen == size_t(-1)) { 628 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, 629 JSMSG_CANT_CONVERT_TO_WIDE); 630 return nullptr; 631 } 632 MOZ_ASSERT(std::mbsinit(&mb), 633 "multi-byte state is in its initial state when no conversion " 634 "error occured"); 635 636 size_t bufLen = wideLen + 1; 637 auto wideChars = cx->make_pod_array<wchar_t>(bufLen); 638 if (!wideChars) { 639 return nullptr; 640 } 641 642 mozilla::DebugOnly<size_t> actualLen = 643 std::mbsrtowcs(wideChars.get(), &chars, bufLen, &mb); 644 MOZ_ASSERT(wideLen == actualLen); 645 MOZ_ASSERT(wideChars[actualLen] == '\0'); 646 647 return EncodeWideToUtf8(cx, wideChars.get()); 648 } 649 650 JS_PUBLIC_API JS::UniqueChars JS::EncodeWideToUtf8(JSContext* cx, 651 const wchar_t* chars) { 652 using CheckedSizeT = mozilla::CheckedInt<size_t>; 653 654 #ifndef XP_LINUX 655 // Use the standard codecvt facet to convert a wide string to UTF-8. 656 std::codecvt_utf8<wchar_t> cv; 657 658 size_t len = std::wcslen(chars); 659 CheckedSizeT utf8MaxLen = CheckedSizeT(len) * cv.max_length(); 660 CheckedSizeT utf8BufLen = utf8MaxLen + 1; 661 if (!utf8BufLen.isValid()) { 662 JS_ReportAllocationOverflow(cx); 663 return nullptr; 664 } 665 auto utf8 = cx->make_pod_array<char>(utf8BufLen.value()); 666 if (!utf8) { 667 return nullptr; 668 } 669 670 // STL returns |codecvt_base::partial| for empty strings. 671 if (len == 0) { 672 utf8[0] = '\0'; // Explicit null-termination required. 673 return utf8; 674 } 675 676 std::mbstate_t mb{}; 677 const wchar_t* fromNext; 678 char* toNext; 679 std::codecvt_base::result result = 680 cv.out(mb, chars, chars + len, fromNext, utf8.get(), 681 utf8.get() + utf8MaxLen.value(), toNext); 682 if (result != std::codecvt_base::ok) { 683 MOZ_ASSERT(result == std::codecvt_base::error); 684 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, 685 JSMSG_CANT_CONVERT_WIDE_TO_UTF8); 686 return nullptr; 687 } 688 *toNext = '\0'; // Explicit null-termination required. 689 690 // codecvt_utf8 doesn't validate its output and may produce WTF-8 instead 691 // of UTF-8 on some platforms when the input contains unpaired surrogate 692 // characters. We don't allow this. 693 if (!mozilla::IsUtf8( 694 mozilla::Span(utf8.get(), size_t(toNext - utf8.get())))) { 695 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, 696 JSMSG_CANT_CONVERT_WIDE_TO_UTF8); 697 return nullptr; 698 } 699 700 return utf8; 701 #else 702 // Alternative code path for Linux, because we still support libstd++ versions 703 // without codecvt support. See also the top comment where <codecvt> is 704 // included. 705 706 static_assert(sizeof(wchar_t) == 4, 707 "Assume wchar_t is UTF-32 on Linux systems"); 708 709 constexpr size_t MaxUtf8CharLength = 4; 710 711 size_t len = std::wcslen(chars); 712 CheckedSizeT utf8MaxLen = CheckedSizeT(len) * MaxUtf8CharLength; 713 CheckedSizeT utf8BufLen = utf8MaxLen + 1; 714 if (!utf8BufLen.isValid()) { 715 JS_ReportAllocationOverflow(cx); 716 return nullptr; 717 } 718 auto utf8 = cx->make_pod_array<char>(utf8BufLen.value()); 719 if (!utf8) { 720 return nullptr; 721 } 722 723 char* dst = utf8.get(); 724 for (size_t i = 0; i < len; i++) { 725 uint8_t utf8buf[MaxUtf8CharLength]; 726 uint32_t utf8Len = OneUcs4ToUtf8Char(utf8buf, chars[i]); 727 for (size_t j = 0; j < utf8Len; j++) { 728 *dst++ = char(utf8buf[j]); 729 } 730 } 731 *dst = '\0'; 732 733 return utf8; 734 #endif 735 } 736 737 JS_PUBLIC_API JS::UniqueChars JS::EncodeUtf8ToNarrow(JSContext* cx, 738 const char* chars) { 739 // Convert the UTF-8 string to a wide string via EncodeUtf8ToWide() and 740 // then convert the resulting wide string to a narrow multibyte character 741 // string. 742 743 auto wideChars = EncodeUtf8ToWide(cx, chars); 744 if (!wideChars) { 745 return nullptr; 746 } 747 748 const wchar_t* cWideChars = wideChars.get(); 749 std::mbstate_t mb{}; 750 size_t narrowLen = std::wcsrtombs(nullptr, &cWideChars, 0, &mb); 751 if (narrowLen == size_t(-1)) { 752 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, 753 JSMSG_CANT_CONVERT_TO_NARROW); 754 return nullptr; 755 } 756 MOZ_ASSERT(std::mbsinit(&mb), 757 "multi-byte state is in its initial state when no conversion " 758 "error occured"); 759 760 size_t bufLen = narrowLen + 1; 761 auto narrow = cx->make_pod_array<char>(bufLen); 762 if (!narrow) { 763 return nullptr; 764 } 765 766 mozilla::DebugOnly<size_t> actualLen = 767 std::wcsrtombs(narrow.get(), &cWideChars, bufLen, &mb); 768 MOZ_ASSERT(narrowLen == actualLen); 769 MOZ_ASSERT(narrow[actualLen] == '\0'); 770 771 return narrow; 772 } 773 774 JS_PUBLIC_API JS::UniqueWideChars JS::EncodeUtf8ToWide(JSContext* cx, 775 const char* chars) { 776 // Only valid UTF-8 strings should be passed to this function. 777 MOZ_ASSERT(mozilla::IsUtf8(mozilla::Span(chars, strlen(chars)))); 778 779 #ifndef XP_LINUX 780 // Use the standard codecvt facet to convert from UTF-8 to a wide string. 781 std::codecvt_utf8<wchar_t> cv; 782 783 size_t len = strlen(chars); 784 auto wideChars = cx->make_pod_array<wchar_t>(len + 1); 785 if (!wideChars) { 786 return nullptr; 787 } 788 789 // STL returns |codecvt_base::partial| for empty strings. 790 if (len == 0) { 791 wideChars[0] = '\0'; // Explicit null-termination required. 792 return wideChars; 793 } 794 795 std::mbstate_t mb{}; 796 const char* fromNext; 797 wchar_t* toNext; 798 std::codecvt_base::result result = 799 cv.in(mb, chars, chars + len, fromNext, wideChars.get(), 800 wideChars.get() + len, toNext); 801 if (result != std::codecvt_base::ok) { 802 MOZ_ASSERT(result == std::codecvt_base::error); 803 JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, 804 JSMSG_CANT_CONVERT_UTF8_TO_WIDE); 805 return nullptr; 806 } 807 *toNext = '\0'; // Explicit null-termination required. 808 809 return wideChars; 810 #else 811 // Alternative code path for Linux, because we still support libstd++ versions 812 // without codecvt support. See also the top comment where <codecvt> is 813 // included. 814 815 static_assert(sizeof(wchar_t) == 4, 816 "Assume wchar_t is UTF-32 on Linux systems"); 817 818 size_t len = strlen(chars); 819 auto wideChars = cx->make_pod_array<wchar_t>(len + 1); 820 if (!wideChars) { 821 return nullptr; 822 } 823 824 const auto* s = reinterpret_cast<const unsigned char*>(chars); 825 const auto* const limit = s + len; 826 827 wchar_t* dst = wideChars.get(); 828 while (s < limit) { 829 unsigned char c = *s++; 830 831 if (mozilla::IsAscii(c)) { 832 *dst++ = wchar_t(c); 833 continue; 834 } 835 836 mozilla::Utf8Unit utf8(c); 837 mozilla::Maybe<char32_t> codePoint = 838 mozilla::DecodeOneUtf8CodePoint(utf8, &s, limit); 839 MOZ_ASSERT(codePoint.isSome()); 840 *dst++ = wchar_t(*codePoint); 841 } 842 *dst++ = '\0'; 843 844 return wideChars; 845 #endif 846 } 847 848 bool StringBuilder::append(const Utf8Unit* units, size_t len) { 849 MOZ_ASSERT(maybeCx_); 850 851 if (isLatin1()) { 852 Latin1CharBuffer& latin1 = latin1Chars(); 853 854 while (len > 0) { 855 if (!IsAscii(*units)) { 856 break; 857 } 858 859 if (!latin1.append(units->toUnsignedChar())) { 860 return false; 861 } 862 863 ++units; 864 --len; 865 } 866 if (len == 0) { 867 return true; 868 } 869 870 // Non-ASCII doesn't *necessarily* mean we couldn't keep appending to 871 // |latin1|, but it's only possible for [U+0080, U+0100) code points, 872 // and handling the full complexity of UTF-8 only for that very small 873 // additional range isn't worth it. Inflate to two-byte storage before 874 // appending the remaining code points. 875 if (!inflateChars()) { 876 return false; 877 } 878 } 879 880 UTF8Chars remainingUtf8(units, len); 881 882 // Determine how many UTF-16 code units are required to represent the 883 // remaining units. 884 size_t utf16Len = 0; 885 auto countInflated = [&utf16Len](char16_t c) -> LoopDisposition { 886 utf16Len++; 887 return LoopDisposition::Continue; 888 }; 889 if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(maybeCx_, remainingUtf8, 890 countInflated)) { 891 return false; 892 } 893 894 TwoByteCharBuffer& buf = twoByteChars(); 895 896 size_t i = buf.length(); 897 if (!buf.growByUninitialized(utf16Len)) { 898 return false; 899 } 900 MOZ_ASSERT(i + utf16Len == buf.length(), 901 "growByUninitialized assumed to increase length immediately"); 902 903 char16_t* toFill = &buf[i]; 904 auto appendUtf16 = [&toFill](char16_t unit) { 905 *toFill++ = unit; 906 return LoopDisposition::Continue; 907 }; 908 909 MOZ_ALWAYS_TRUE(InflateUTF8ToUTF16<OnUTF8Error::Throw>( 910 maybeCx_, remainingUtf8, appendUtf16)); 911 MOZ_ASSERT(toFill == buf.end()); 912 return true; 913 }