[ tor-browser ].git.dasho

CharacterEncoding.cpp (29728B)
      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "js/CharacterEncoding.h"
      8 
      9 #include "mozilla/CheckedInt.h"
     10 #include "mozilla/DebugOnly.h"
     11 #include "mozilla/Latin1.h"
     12 #include "mozilla/Maybe.h"
     13 #include "mozilla/Range.h"
     14 #include "mozilla/Span.h"
     15 #include "mozilla/Sprintf.h"
     16 #include "mozilla/TextUtils.h"
     17 #include "mozilla/Utf8.h"
     18 
     19 #ifndef XP_LINUX
// We still support libstdc++ versions without codecvt support on Linux.
//
// When the minimum supported libstdc++ version is bumped to 3.4.21, we can
// enable the codecvt code path for Linux, too. This should happen in 2024 when
// support for CentOS 7 is removed.
     25 #  include <codecvt>
     26 #endif
     27 #include <cwchar>
     28 #include <limits>
     29 #include <locale>
     30 #include <type_traits>
     31 
     32 #include "frontend/FrontendContext.h"
     33 #include "js/friend/ErrorMessages.h"  // js::GetErrorMessage, JSMSG_*
     34 #include "util/StringBuilder.h"
     35 #include "util/Unicode.h"  // unicode::REPLACEMENT_CHARACTER
     36 #include "vm/JSContext.h"
     37 
     38 using mozilla::AsChars;
     39 using mozilla::AsciiValidUpTo;
     40 using mozilla::AsWritableChars;
     41 using mozilla::ConvertLatin1toUtf8Partial;
     42 using mozilla::ConvertUtf16toUtf8Partial;
     43 using mozilla::IsAscii;
     44 using mozilla::IsUtf8Latin1;
     45 using mozilla::LossyConvertUtf16toLatin1;
     46 using mozilla::Span;
     47 using mozilla::Utf8Unit;
     48 
     49 using JS::Latin1CharsZ;
     50 using JS::TwoByteCharsZ;
     51 using JS::UTF8Chars;
     52 using JS::UTF8CharsZ;
     53 
     54 using namespace js;
     55 using namespace js::unicode;
     56 
     57 Latin1CharsZ JS::LossyTwoByteCharsToNewLatin1CharsZ(
     58    JSContext* cx, const mozilla::Range<const char16_t>& tbchars) {
     59  MOZ_ASSERT(cx);
     60  size_t len = tbchars.length();
     61  unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1);
     62  if (!latin1) {
     63    return Latin1CharsZ();
     64  }
     65  LossyConvertUtf16toLatin1(tbchars, AsWritableChars(Span(latin1, len)));
     66  latin1[len] = '\0';
     67  return Latin1CharsZ(latin1, len);
     68 }
     69 
     70 template <typename CharT>
     71 static size_t GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars) {
     72  size_t nbytes = nchars;
     73  for (const CharT* end = chars + nchars; chars < end; chars++) {
     74    char16_t c = *chars;
     75    if (c < 0x80) {
     76      continue;
     77    }
     78    char32_t v;
     79    if (IsSurrogate(c)) {
     80      /* nbytes sets 1 length since this is surrogate pair. */
     81      if (IsTrailSurrogate(c) || (chars + 1) == end) {
     82        nbytes += 2; /* Bad Surrogate */
     83        continue;
     84      }
     85      char16_t c2 = chars[1];
     86      if (!IsTrailSurrogate(c2)) {
     87        nbytes += 2; /* Bad Surrogate */
     88        continue;
     89      }
     90      v = UTF16Decode(c, c2);
     91      nbytes--;
     92      chars++;
     93    } else {
     94      v = c;
     95    }
     96    v >>= 11;
     97    nbytes++;
     98    while (v) {
     99      v >>= 5;
    100      nbytes++;
    101    }
    102  }
    103  return nbytes;
    104 }
    105 
    106 JS_PUBLIC_API size_t JS::GetDeflatedUTF8StringLength(JSLinearString* s) {
    107  JS::AutoCheckCannotGC nogc;
    108  return s->hasLatin1Chars()
    109             ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length())
    110             : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc),
    111                                             s->length());
    112 }
    113 
    114 JS_PUBLIC_API size_t JS::DeflateStringToUTF8Buffer(JSLinearString* src,
    115                                                   mozilla::Span<char> dst) {
    116  JS::AutoCheckCannotGC nogc;
    117  if (src->hasLatin1Chars()) {
    118    auto source = AsChars(Span(src->latin1Chars(nogc), src->length()));
    119    auto [read, written] = ConvertLatin1toUtf8Partial(source, dst);
    120    (void)read;
    121    return written;
    122  }
    123  auto source = Span(src->twoByteChars(nogc), src->length());
    124  auto [read, written] = ConvertUtf16toUtf8Partial(source, dst);
    125  (void)read;
    126  return written;
    127 }
    128 
// Encode |src| as UTF-8 into |dst|. The primary template is only declared;
// the explicit specializations below provide the implementations for
// |const char16_t| and |const Latin1Char| input.
template <typename CharT>
void ConvertToUTF8(mozilla::Span<CharT> src, mozilla::Span<char> dst);

// UTF-16 input: forwards to ConvertUtf16toUtf8Partial and deliberately
// discards its result.
template <>
void ConvertToUTF8<const char16_t>(mozilla::Span<const char16_t> src,
                                   mozilla::Span<char> dst) {
  (void)ConvertUtf16toUtf8Partial(src, dst);
}

// Latin-1 input: reinterprets the bytes via AsChars, forwards to
// ConvertLatin1toUtf8Partial and deliberately discards its result.
template <>
void ConvertToUTF8<const Latin1Char>(mozilla::Span<const Latin1Char> src,
                                     mozilla::Span<char> dst) {
  (void)ConvertLatin1toUtf8Partial(AsChars(src), dst);
}
    143 
    144 template <typename CharT, typename Allocator>
    145 UTF8CharsZ JS::CharsToNewUTF8CharsZ(Allocator* alloc,
    146                                    const mozilla::Range<CharT>& chars) {
    147  /* Get required buffer size. */
    148  const CharT* str = chars.begin().get();
    149  size_t len = ::GetDeflatedUTF8StringLength(str, chars.length());
    150 
    151  /* Allocate buffer. */
    152  char* utf8 = alloc->template pod_malloc<char>(len + 1);
    153  if (!utf8) {
    154    return UTF8CharsZ();
    155  }
    156 
    157  /* Encode to UTF8. */
    158  ::ConvertToUTF8(Span(str, chars.length()), Span(utf8, len));
    159  utf8[len] = '\0';
    160 
    161  return UTF8CharsZ(utf8, len);
    162 }
    163 
// Explicit instantiations of CharsToNewUTF8CharsZ: one per combination of
// allocator (JSContext, FrontendAllocator) and character range (const and
// non-const Latin1Char and char16_t).

// JSContext allocator.
template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* cx, const mozilla::Range<Latin1Char>& chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* cx, const mozilla::Range<char16_t>& chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* cx, const mozilla::Range<const Latin1Char>& chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    JSContext* cx, const mozilla::Range<const char16_t>& chars);

// FrontendAllocator allocator.
template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    FrontendAllocator* cx, const mozilla::Range<Latin1Char>& chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    FrontendAllocator* cx, const mozilla::Range<char16_t>& chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    FrontendAllocator* cx, const mozilla::Range<const Latin1Char>& chars);

template UTF8CharsZ JS::CharsToNewUTF8CharsZ(
    FrontendAllocator* cx, const mozilla::Range<const char16_t>& chars);
    187 
    188 static constexpr uint32_t INVALID_UTF8 = std::numeric_limits<char32_t>::max();
    189 
    190 /*
    191 * Convert a UTF-8 character sequence into a UCS-4 character and return that
    192 * character. It is assumed that the caller already checked that the sequence
    193 * is valid.
    194 */
    195 static char32_t Utf8ToOneUcs4CharImpl(const uint8_t* utf8Buffer,
    196                                      int utf8Length) {
    197  MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);
    198 
    199  if (utf8Length == 1) {
    200    MOZ_ASSERT(!(*utf8Buffer & 0x80));
    201    return *utf8Buffer;
    202  }
    203 
    204  /* from Unicode 3.1, non-shortest form is illegal */
    205  static const char32_t minucs4Table[] = {0x80, 0x800, NonBMPMin};
    206 
    207  MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
    208             (0x100 - (1 << (8 - utf8Length))));
    209  char32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
    210  char32_t minucs4Char = minucs4Table[utf8Length - 2];
    211  while (--utf8Length) {
    212    MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80);
    213    ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
    214  }
    215 
    216  if (MOZ_UNLIKELY(ucs4Char < minucs4Char)) {
    217    return INVALID_UTF8;
    218  }
    219 
    220  if (MOZ_UNLIKELY(IsSurrogate(ucs4Char))) {
    221    return INVALID_UTF8;
    222  }
    223 
    224  return ucs4Char;
    225 }
    226 
// Public wrapper that forwards unchanged to the file-local
// Utf8ToOneUcs4CharImpl; see that function for the preconditions on
// |utf8Buffer| and |utf8Length| and for the INVALID_UTF8 result.
char32_t JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length) {
  return Utf8ToOneUcs4CharImpl(utf8Buffer, utf8Length);
}
    230 
    231 static void ReportInvalidCharacter(JSContext* cx, uint32_t offset) {
    232  char buffer[10];
    233  SprintfLiteral(buffer, "%u", offset);
    234  JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
    235                            JSMSG_MALFORMED_UTF8_CHAR, buffer);
    236 }
    237 
// Report JSMSG_BUFFER_TOO_SMALL on |cx|. |dummy| is unused; the parameter
// gives this function the same two-argument shape as the other Report*
// helpers so that it can be invoked through the same call pattern.
static void ReportBufferTooSmall(JSContext* cx, uint32_t dummy) {
  JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                            JSMSG_BUFFER_TOO_SMALL);
}
    242 
// Report JSMSG_UTF8_CHAR_TOO_LARGE on |cx|, passing |v| rendered as a
// "0x"-prefixed hexadecimal number as the message argument.
static void ReportTooBigCharacter(JSContext* cx, uint32_t v) {
  // "0x" + up to eight hex digits + terminating NUL.
  char buffer[11];
  SprintfLiteral(buffer, "0x%x", v);
  JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                            JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
}
    249 
// Value returned by the output callbacks passed to InflateUTF8ToUTF16 to tell
// the scanning loop how to proceed after receiving a code unit.
enum class LoopDisposition {
  // Stop scanning the input.
  Break,
  // Keep scanning the input.
  Continue,
};
    254 
// Selects what InflateUTF8ToUTF16 does when it encounters malformed UTF-8.
enum class OnUTF8Error {
  // Emit REPLACEMENT_CHARACTER in place of the malformed bytes and go on.
  InsertReplacementCharacter,
  // Emit '?' in place of the malformed bytes and go on.
  InsertQuestionMark,
  // Report an error on the JSContext and make the scan return false.
  Throw,
  // Abort via MOZ_CRASH.
  Crash,
};
    261 
    262 inline bool IsInvalidSecondByte(uint32_t first, uint8_t second) {
    263  // Perform an extra check aginst the second byte.
    264  // From Unicode Standard v6.2, Table 3-7 Well-Formed UTF-8 Byte Sequences.
    265  //
    266  // The consumer should perform a followup check for second & 0xC0 == 0x80.
    267  return (first == 0xE0 && (second & 0xE0) != 0xA0) ||  // E0 A0~BF
    268         (first == 0xED && (second & 0xE0) != 0x80) ||  // ED 80~9F
    269         (first == 0xF0 && (second & 0xF0) == 0x80) ||  // F0 90~BF
    270         (first == 0xF4 && (second & 0xF0) != 0x80);    // F4 80~8F
    271 }
    272 
    273 // Scan UTF-8 input and (internally, at least) convert it to a series of UTF-16
    274 // code units. But you can also do odd things like pass an empty lambda for
    275 // `dst`, in which case the output is discarded entirely--the only effect of
    276 // calling the template that way is error-checking.
    277 template <OnUTF8Error ErrorAction, typename OutputFn>
    278 static bool InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars& src,
    279                               OutputFn dst) {
    280  size_t srclen = src.length();
    281  for (uint32_t i = 0; i < srclen; i++) {
    282    uint32_t v = uint32_t(src[i]);
    283    if (!(v & 0x80)) {
    284      // ASCII code unit.  Simple copy.
    285      if (dst(uint16_t(v)) == LoopDisposition::Break) {
    286        break;
    287      }
    288    } else {
    289 #define INVALID(report, arg, n2)                                    \
    290  do {                                                              \
    291    if (ErrorAction == OnUTF8Error::Throw) {                        \
    292      report(cx, arg);                                              \
    293      return false;                                                 \
    294    } else if (ErrorAction == OnUTF8Error::Crash) {                 \
    295      MOZ_CRASH("invalid UTF-8 string: " #report);                  \
    296    } else {                                                        \
    297      char16_t replacement;                                         \
    298      if (ErrorAction == OnUTF8Error::InsertReplacementCharacter) { \
    299        replacement = REPLACEMENT_CHARACTER;                        \
    300      } else {                                                      \
    301        MOZ_ASSERT(ErrorAction == OnUTF8Error::InsertQuestionMark); \
    302        replacement = '?';                                          \
    303      }                                                             \
    304      if (dst(replacement) == LoopDisposition::Break) {             \
    305        break;                                                      \
    306      }                                                             \
    307      n = n2;                                                       \
    308      goto invalidMultiByteCodeUnit;                                \
    309    }                                                               \
    310  } while (0)
    311 
    312      // Non-ASCII code unit. Determine its length in bytes (n).
    313      //
    314      // Avoid undefined behavior from passing in 0
    315      // (https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html#index-_005f_005fbuiltin_005fclz)
    316      // by turning on the low bit so that 0xff will set n=31-24=7, which will
    317      // be detected as an invalid character.
    318      uint32_t n = mozilla::CountLeadingZeroes32(~int8_t(src[i]) | 0x1) - 24;
    319 
    320      // Check the leading byte.
    321      if (n < 2 || n > 4) {
    322        INVALID(ReportInvalidCharacter, i, 1);
    323      }
    324 
    325      // Check that |src| is large enough to hold an n-byte code unit.
    326      if (i + n > srclen) {
    327        // Check the second and continuation bytes, to replace maximal subparts
    328        // of an ill-formed subsequence with single U+FFFD.
    329        if (i + 2 > srclen) {
    330          INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
    331        }
    332 
    333        if (IsInvalidSecondByte(v, (uint8_t)src[i + 1])) {
    334          INVALID(ReportInvalidCharacter, i, 1);
    335        }
    336 
    337        if ((src[i + 1] & 0xC0) != 0x80) {
    338          INVALID(ReportInvalidCharacter, i, 1);
    339        }
    340 
    341        if (n == 3) {
    342          INVALID(ReportInvalidCharacter, i, 2);
    343        } else {
    344          if (i + 3 > srclen) {
    345            INVALID(ReportBufferTooSmall, /* dummy = */ 0, 2);
    346          }
    347          if ((src[i + 2] & 0xC0) != 0x80) {
    348            INVALID(ReportInvalidCharacter, i, 2);
    349          }
    350          INVALID(ReportInvalidCharacter, i, 3);
    351        }
    352      }
    353 
    354      if (IsInvalidSecondByte(v, (uint8_t)src[i + 1])) {
    355        INVALID(ReportInvalidCharacter, i, 1);
    356      }
    357 
    358      // Check the continuation bytes.
    359      for (uint32_t m = 1; m < n; m++) {
    360        if ((src[i + m] & 0xC0) != 0x80) {
    361          INVALID(ReportInvalidCharacter, i, m);
    362        }
    363      }
    364 
    365      // Determine the code unit's length in CharT and act accordingly.
    366      v = Utf8ToOneUcs4CharImpl((uint8_t*)&src[i], n);
    367      if (v < NonBMPMin) {
    368        // The n-byte UTF8 code unit will fit in a single CharT.
    369        if (dst(char16_t(v)) == LoopDisposition::Break) {
    370          break;
    371        }
    372      } else if (v <= NonBMPMax) {
    373        // The n-byte UTF8 code unit will fit in two CharT units.
    374        if (dst(LeadSurrogate(v)) == LoopDisposition::Break) {
    375          break;
    376        }
    377        if (dst(TrailSurrogate(v)) == LoopDisposition::Break) {
    378          break;
    379        }
    380      } else {
    381        // The n-byte UTF8 code unit won't fit in two CharT units.
    382        INVALID(ReportTooBigCharacter, v, 1);
    383      }
    384 
    385    invalidMultiByteCodeUnit:
    386      // Move i to the last byte of the multi-byte code unit; the loop
    387      // header will do the final i++ to move to the start of the next
    388      // code unit.
    389      i += n - 1;
    390    }
    391  }
    392 
    393  return true;
    394 }
    395 
    396 template <OnUTF8Error ErrorAction, typename CharT>
    397 static void CopyAndInflateUTF8IntoBuffer(JSContext* cx, const UTF8Chars& src,
    398                                         CharT* dst, size_t outlen,
    399                                         bool allASCII) {
    400  if (allASCII) {
    401    size_t srclen = src.length();
    402    MOZ_ASSERT(outlen == srclen);
    403    for (uint32_t i = 0; i < srclen; i++) {
    404      dst[i] = CharT(src[i]);
    405    }
    406  } else {
    407    size_t j = 0;
    408    auto push = [dst, &j](char16_t c) -> LoopDisposition {
    409      dst[j++] = CharT(c);
    410      return LoopDisposition::Continue;
    411    };
    412    MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<ErrorAction>(cx, src, push)));
    413    MOZ_ASSERT(j == outlen);
    414  }
    415 }
    416 
    417 template <OnUTF8Error ErrorAction, typename CharsT>
    418 static CharsT InflateUTF8StringHelper(JSContext* cx, const UTF8Chars& src,
    419                                      size_t* outlen, arena_id_t destArenaId) {
    420  using CharT = typename CharsT::CharT;
    421  static_assert(
    422      std::is_same_v<CharT, char16_t> || std::is_same_v<CharT, Latin1Char>,
    423      "bad CharT");
    424 
    425  *outlen = 0;
    426 
    427  size_t len = 0;
    428  bool allASCII = true;
    429  auto count = [&len, &allASCII](char16_t c) -> LoopDisposition {
    430    len++;
    431    allASCII &= (c < 0x80);
    432    return LoopDisposition::Continue;
    433  };
    434  if (!InflateUTF8ToUTF16<ErrorAction>(cx, src, count)) {
    435    return CharsT();
    436  }
    437  *outlen = len;
    438 
    439  CharT* dst = cx->pod_arena_malloc<CharT>(destArenaId,
    440                                           *outlen + 1);  // +1 for NUL
    441 
    442  if (!dst) {
    443    ReportOutOfMemory(cx);
    444    return CharsT();
    445  }
    446 
    447  constexpr OnUTF8Error errorMode =
    448      std::is_same_v<CharT, Latin1Char>
    449          ? OnUTF8Error::InsertQuestionMark
    450          : OnUTF8Error::InsertReplacementCharacter;
    451  CopyAndInflateUTF8IntoBuffer<errorMode>(cx, src, dst, *outlen, allASCII);
    452  dst[*outlen] = CharT('\0');
    453 
    454  return CharsT(dst, *outlen);
    455 }
    456 
// Decode |utf8| into a newly allocated, NUL-terminated char16_t buffer from
// arena |destArenaId|, storing the number of code units in |*outlen|.
//
// Uses OnUTF8Error::Throw: malformed input is reported on |cx| and an empty
// TwoByteCharsZ is returned.
TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx,
                                              const UTF8Chars& utf8,
                                              size_t* outlen,
                                              arena_id_t destArenaId) {
  return InflateUTF8StringHelper<OnUTF8Error::Throw, TwoByteCharsZ>(
      cx, utf8, outlen, destArenaId);
}
    464 
// Like UTF8CharsToNewTwoByteCharsZ, but uses
// OnUTF8Error::InsertReplacementCharacter: malformed input is replaced with
// REPLACEMENT_CHARACTER instead of being reported as an error.
TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx,
                                                   const JS::UTF8Chars& utf8,
                                                   size_t* outlen,
                                                   arena_id_t destArenaId) {
  return InflateUTF8StringHelper<OnUTF8Error::InsertReplacementCharacter,
                                 TwoByteCharsZ>(cx, utf8, outlen, destArenaId);
}
    472 
    473 static void UpdateSmallestEncodingForChar(char16_t c,
    474                                          JS::SmallestEncoding* encoding) {
    475  JS::SmallestEncoding newEncoding = JS::SmallestEncoding::ASCII;
    476  if (c >= 0x80) {
    477    if (c < 0x100) {
    478      newEncoding = JS::SmallestEncoding::Latin1;
    479    } else {
    480      newEncoding = JS::SmallestEncoding::UTF16;
    481    }
    482  }
    483  if (newEncoding > *encoding) {
    484    *encoding = newEncoding;
    485  }
    486 }
    487 
    488 JS::SmallestEncoding JS::FindSmallestEncoding(const UTF8Chars& utf8) {
    489  Span<const unsigned char> unsignedSpan = utf8;
    490  auto charSpan = AsChars(unsignedSpan);
    491  size_t upTo = AsciiValidUpTo(charSpan);
    492  if (upTo == charSpan.Length()) {
    493    return SmallestEncoding::ASCII;
    494  }
    495  if (IsUtf8Latin1(charSpan.From(upTo))) {
    496    return SmallestEncoding::Latin1;
    497  }
    498  return SmallestEncoding::UTF16;
    499 }
    500 
// Decode |utf8| into a newly allocated, NUL-terminated Latin1Char buffer from
// arena |destArenaId|, storing the number of characters in |*outlen|.
//
// Uses OnUTF8Error::Throw: malformed input is reported on |cx| and an empty
// Latin1CharsZ is returned.
//
// NOTE(review): decoded units are narrowed to Latin1Char by a plain cast, so
// this presumably expects input whose code points all fit in Latin-1 --
// confirm against callers.
Latin1CharsZ JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx,
                                            const UTF8Chars& utf8,
                                            size_t* outlen,
                                            arena_id_t destArenaId) {
  return InflateUTF8StringHelper<OnUTF8Error::Throw, Latin1CharsZ>(
      cx, utf8, outlen, destArenaId);
}
    508 
    509 /**
    510 * Atomization Helpers.
    511 *
    512 * These functions are extremely single-use, and are not intended for general
    513 * consumption.
    514 */
    515 
    516 bool GetUTF8AtomizationData(JSContext* cx, const JS::UTF8Chars& utf8,
    517                            size_t* outlen, JS::SmallestEncoding* encoding,
    518                            HashNumber* hashNum) {
    519  *outlen = 0;
    520  *encoding = JS::SmallestEncoding::ASCII;
    521  *hashNum = 0;
    522 
    523  auto getMetadata = [outlen, encoding,
    524                      hashNum](char16_t c) -> LoopDisposition {
    525    (*outlen)++;
    526    UpdateSmallestEncodingForChar(c, encoding);
    527    *hashNum = mozilla::AddToHash(*hashNum, c);
    528    return LoopDisposition::Continue;
    529  };
    530  if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(cx, utf8, getMetadata)) {
    531    return false;
    532  }
    533 
    534  return true;
    535 }
    536 
    537 template <typename CharT>
    538 bool UTF8EqualsChars(const JS::UTF8Chars& utfChars, const CharT* chars) {
    539  size_t ind = 0;
    540  bool isEqual = true;
    541 
    542  auto checkEqual = [&isEqual, &ind, chars](char16_t c) -> LoopDisposition {
    543 #ifdef DEBUG
    544    JS::SmallestEncoding encoding = JS::SmallestEncoding::ASCII;
    545    UpdateSmallestEncodingForChar(c, &encoding);
    546    if (std::is_same_v<CharT, JS::Latin1Char>) {
    547      MOZ_ASSERT(encoding <= JS::SmallestEncoding::Latin1);
    548    } else if (!std::is_same_v<CharT, char16_t>) {
    549      MOZ_CRASH("Invalid character type in UTF8EqualsChars");
    550    }
    551 #endif
    552 
    553    if (CharT(c) != chars[ind]) {
    554      isEqual = false;
    555      return LoopDisposition::Break;
    556    }
    557 
    558    ind++;
    559    return LoopDisposition::Continue;
    560  };
    561 
    562  // To get here, you must have checked your work.
    563  InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, utfChars,
    564                                         checkEqual);
    565 
    566  return isEqual;
    567 }
    568 
    569 template bool UTF8EqualsChars(const JS::UTF8Chars&, const char16_t*);
    570 template bool UTF8EqualsChars(const JS::UTF8Chars&, const JS::Latin1Char*);
    571 
// Decode |src| into the caller-provided buffer |dst| of |dstLen| units.
//
// |encoding| is only used to select the direct byte-copy path when it is
// SmallestEncoding::ASCII. Malformed input crashes (OnUTF8Error::Crash), so
// the caller must pass input that is already known to be valid.
template <typename CharT>
void InflateUTF8CharsToBuffer(const JS::UTF8Chars& src, CharT* dst,
                              size_t dstLen, JS::SmallestEncoding encoding) {
  CopyAndInflateUTF8IntoBuffer<OnUTF8Error::Crash>(
      /* cx = */ nullptr, src, dst, dstLen,
      encoding == JS::SmallestEncoding::ASCII);
}

// Explicit instantiations for the two supported destination character types.
template void InflateUTF8CharsToBuffer(const UTF8Chars& src, char16_t* dst,
                                       size_t dstLen,
                                       JS::SmallestEncoding encoding);
template void InflateUTF8CharsToBuffer(const UTF8Chars& src,
                                       JS::Latin1Char* dst, size_t dstLen,
                                       JS::SmallestEncoding encoding);
    586 
#ifdef DEBUG
// Debug-only check that the first |aLength| bytes of |data_| are valid UTF-8.
// The bytes are run through the decoder with a callback that discards the
// output; malformed input crashes (OnUTF8Error::Crash).
void JS::ConstUTF8CharsZ::validate(size_t aLength) {
  MOZ_ASSERT(data_);
  UTF8Chars chars(data_, aLength);
  auto nop = [](char16_t) -> LoopDisposition {
    return LoopDisposition::Continue;
  };
  InflateUTF8ToUTF16<OnUTF8Error::Crash>(/* cx = */ nullptr, chars, nop);
}
// Same as validate(), with the length taken from strlen(data_).
void JS::ConstUTF8CharsZ::validateWithoutLength() {
  MOZ_ASSERT(data_);
  validate(strlen(data_));
}
#endif
    601 
    602 bool JS::StringIsASCII(const char* s) {
    603  while (*s) {
    604    if (*s & 0x80) {
    605      return false;
    606    }
    607    s++;
    608  }
    609  return true;
    610 }
    611 
    612 bool JS::StringIsASCII(Span<const char> s) { return IsAscii(s); }
    613 
    614 JS_PUBLIC_API JS::UniqueChars JS::EncodeNarrowToUtf8(JSContext* cx,
    615                                                     const char* chars) {
    616  // Convert the narrow multibyte character string to a wide string and then
    617  // use EncodeWideToUtf8() to convert the wide string to a UTF-8 string.
    618 
    619  std::mbstate_t mb{};
    620 
    621  // NOTE: The 2nd parameter is overwritten even if the 1st parameter is nullptr
    622  //       on Android NDK older than v16.  Use a temporary variable to save the
    623  //       `chars` for the subsequent call.  See bug 1492090.
    624  const char* tmpChars = chars;
    625 
    626  size_t wideLen = std::mbsrtowcs(nullptr, &tmpChars, 0, &mb);
    627  if (wideLen == size_t(-1)) {
    628    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
    629                              JSMSG_CANT_CONVERT_TO_WIDE);
    630    return nullptr;
    631  }
    632  MOZ_ASSERT(std::mbsinit(&mb),
    633             "multi-byte state is in its initial state when no conversion "
    634             "error occured");
    635 
    636  size_t bufLen = wideLen + 1;
    637  auto wideChars = cx->make_pod_array<wchar_t>(bufLen);
    638  if (!wideChars) {
    639    return nullptr;
    640  }
    641 
    642  mozilla::DebugOnly<size_t> actualLen =
    643      std::mbsrtowcs(wideChars.get(), &chars, bufLen, &mb);
    644  MOZ_ASSERT(wideLen == actualLen);
    645  MOZ_ASSERT(wideChars[actualLen] == '\0');
    646 
    647  return EncodeWideToUtf8(cx, wideChars.get());
    648 }
    649 
    650 JS_PUBLIC_API JS::UniqueChars JS::EncodeWideToUtf8(JSContext* cx,
    651                                                   const wchar_t* chars) {
    652  using CheckedSizeT = mozilla::CheckedInt<size_t>;
    653 
    654 #ifndef XP_LINUX
    655  // Use the standard codecvt facet to convert a wide string to UTF-8.
    656  std::codecvt_utf8<wchar_t> cv;
    657 
    658  size_t len = std::wcslen(chars);
    659  CheckedSizeT utf8MaxLen = CheckedSizeT(len) * cv.max_length();
    660  CheckedSizeT utf8BufLen = utf8MaxLen + 1;
    661  if (!utf8BufLen.isValid()) {
    662    JS_ReportAllocationOverflow(cx);
    663    return nullptr;
    664  }
    665  auto utf8 = cx->make_pod_array<char>(utf8BufLen.value());
    666  if (!utf8) {
    667    return nullptr;
    668  }
    669 
    670  // STL returns |codecvt_base::partial| for empty strings.
    671  if (len == 0) {
    672    utf8[0] = '\0';  // Explicit null-termination required.
    673    return utf8;
    674  }
    675 
    676  std::mbstate_t mb{};
    677  const wchar_t* fromNext;
    678  char* toNext;
    679  std::codecvt_base::result result =
    680      cv.out(mb, chars, chars + len, fromNext, utf8.get(),
    681             utf8.get() + utf8MaxLen.value(), toNext);
    682  if (result != std::codecvt_base::ok) {
    683    MOZ_ASSERT(result == std::codecvt_base::error);
    684    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
    685                              JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
    686    return nullptr;
    687  }
    688  *toNext = '\0';  // Explicit null-termination required.
    689 
    690  // codecvt_utf8 doesn't validate its output and may produce WTF-8 instead
    691  // of UTF-8 on some platforms when the input contains unpaired surrogate
    692  // characters. We don't allow this.
    693  if (!mozilla::IsUtf8(
    694          mozilla::Span(utf8.get(), size_t(toNext - utf8.get())))) {
    695    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
    696                              JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
    697    return nullptr;
    698  }
    699 
    700  return utf8;
    701 #else
    702  // Alternative code path for Linux, because we still support libstd++ versions
    703  // without codecvt support. See also the top comment where <codecvt> is
    704  // included.
    705 
    706  static_assert(sizeof(wchar_t) == 4,
    707                "Assume wchar_t is UTF-32 on Linux systems");
    708 
    709  constexpr size_t MaxUtf8CharLength = 4;
    710 
    711  size_t len = std::wcslen(chars);
    712  CheckedSizeT utf8MaxLen = CheckedSizeT(len) * MaxUtf8CharLength;
    713  CheckedSizeT utf8BufLen = utf8MaxLen + 1;
    714  if (!utf8BufLen.isValid()) {
    715    JS_ReportAllocationOverflow(cx);
    716    return nullptr;
    717  }
    718  auto utf8 = cx->make_pod_array<char>(utf8BufLen.value());
    719  if (!utf8) {
    720    return nullptr;
    721  }
    722 
    723  char* dst = utf8.get();
    724  for (size_t i = 0; i < len; i++) {
    725    uint8_t utf8buf[MaxUtf8CharLength];
    726    uint32_t utf8Len = OneUcs4ToUtf8Char(utf8buf, chars[i]);
    727    for (size_t j = 0; j < utf8Len; j++) {
    728      *dst++ = char(utf8buf[j]);
    729    }
    730  }
    731  *dst = '\0';
    732 
    733  return utf8;
    734 #endif
    735 }
    736 
    737 JS_PUBLIC_API JS::UniqueChars JS::EncodeUtf8ToNarrow(JSContext* cx,
    738                                                     const char* chars) {
    739  // Convert the UTF-8 string to a wide string via EncodeUtf8ToWide() and
    740  // then convert the resulting wide string to a narrow multibyte character
    741  // string.
    742 
    743  auto wideChars = EncodeUtf8ToWide(cx, chars);
    744  if (!wideChars) {
    745    return nullptr;
    746  }
    747 
    748  const wchar_t* cWideChars = wideChars.get();
    749  std::mbstate_t mb{};
    750  size_t narrowLen = std::wcsrtombs(nullptr, &cWideChars, 0, &mb);
    751  if (narrowLen == size_t(-1)) {
    752    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
    753                              JSMSG_CANT_CONVERT_TO_NARROW);
    754    return nullptr;
    755  }
    756  MOZ_ASSERT(std::mbsinit(&mb),
    757             "multi-byte state is in its initial state when no conversion "
    758             "error occured");
    759 
    760  size_t bufLen = narrowLen + 1;
    761  auto narrow = cx->make_pod_array<char>(bufLen);
    762  if (!narrow) {
    763    return nullptr;
    764  }
    765 
    766  mozilla::DebugOnly<size_t> actualLen =
    767      std::wcsrtombs(narrow.get(), &cWideChars, bufLen, &mb);
    768  MOZ_ASSERT(narrowLen == actualLen);
    769  MOZ_ASSERT(narrow[actualLen] == '\0');
    770 
    771  return narrow;
    772 }
    773 
    774 JS_PUBLIC_API JS::UniqueWideChars JS::EncodeUtf8ToWide(JSContext* cx,
    775                                                       const char* chars) {
    776  // Only valid UTF-8 strings should be passed to this function.
    777  MOZ_ASSERT(mozilla::IsUtf8(mozilla::Span(chars, strlen(chars))));
    778 
    779 #ifndef XP_LINUX
    780  // Use the standard codecvt facet to convert from UTF-8 to a wide string.
    781  std::codecvt_utf8<wchar_t> cv;
    782 
    783  size_t len = strlen(chars);
    784  auto wideChars = cx->make_pod_array<wchar_t>(len + 1);
    785  if (!wideChars) {
    786    return nullptr;
    787  }
    788 
    789  // STL returns |codecvt_base::partial| for empty strings.
    790  if (len == 0) {
    791    wideChars[0] = '\0';  // Explicit null-termination required.
    792    return wideChars;
    793  }
    794 
    795  std::mbstate_t mb{};
    796  const char* fromNext;
    797  wchar_t* toNext;
    798  std::codecvt_base::result result =
    799      cv.in(mb, chars, chars + len, fromNext, wideChars.get(),
    800            wideChars.get() + len, toNext);
    801  if (result != std::codecvt_base::ok) {
    802    MOZ_ASSERT(result == std::codecvt_base::error);
    803    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
    804                              JSMSG_CANT_CONVERT_UTF8_TO_WIDE);
    805    return nullptr;
    806  }
    807  *toNext = '\0';  // Explicit null-termination required.
    808 
    809  return wideChars;
    810 #else
    811  // Alternative code path for Linux, because we still support libstd++ versions
    812  // without codecvt support. See also the top comment where <codecvt> is
    813  // included.
    814 
    815  static_assert(sizeof(wchar_t) == 4,
    816                "Assume wchar_t is UTF-32 on Linux systems");
    817 
    818  size_t len = strlen(chars);
    819  auto wideChars = cx->make_pod_array<wchar_t>(len + 1);
    820  if (!wideChars) {
    821    return nullptr;
    822  }
    823 
    824  const auto* s = reinterpret_cast<const unsigned char*>(chars);
    825  const auto* const limit = s + len;
    826 
    827  wchar_t* dst = wideChars.get();
    828  while (s < limit) {
    829    unsigned char c = *s++;
    830 
    831    if (mozilla::IsAscii(c)) {
    832      *dst++ = wchar_t(c);
    833      continue;
    834    }
    835 
    836    mozilla::Utf8Unit utf8(c);
    837    mozilla::Maybe<char32_t> codePoint =
    838        mozilla::DecodeOneUtf8CodePoint(utf8, &s, limit);
    839    MOZ_ASSERT(codePoint.isSome());
    840    *dst++ = wchar_t(*codePoint);
    841  }
    842  *dst++ = '\0';
    843 
    844  return wideChars;
    845 #endif
    846 }
    847 
    848 bool StringBuilder::append(const Utf8Unit* units, size_t len) {
    849  MOZ_ASSERT(maybeCx_);
    850 
    851  if (isLatin1()) {
    852    Latin1CharBuffer& latin1 = latin1Chars();
    853 
    854    while (len > 0) {
    855      if (!IsAscii(*units)) {
    856        break;
    857      }
    858 
    859      if (!latin1.append(units->toUnsignedChar())) {
    860        return false;
    861      }
    862 
    863      ++units;
    864      --len;
    865    }
    866    if (len == 0) {
    867      return true;
    868    }
    869 
    870    // Non-ASCII doesn't *necessarily* mean we couldn't keep appending to
    871    // |latin1|, but it's only possible for [U+0080, U+0100) code points,
    872    // and handling the full complexity of UTF-8 only for that very small
    873    // additional range isn't worth it.  Inflate to two-byte storage before
    874    // appending the remaining code points.
    875    if (!inflateChars()) {
    876      return false;
    877    }
    878  }
    879 
    880  UTF8Chars remainingUtf8(units, len);
    881 
    882  // Determine how many UTF-16 code units are required to represent the
    883  // remaining units.
    884  size_t utf16Len = 0;
    885  auto countInflated = [&utf16Len](char16_t c) -> LoopDisposition {
    886    utf16Len++;
    887    return LoopDisposition::Continue;
    888  };
    889  if (!InflateUTF8ToUTF16<OnUTF8Error::Throw>(maybeCx_, remainingUtf8,
    890                                              countInflated)) {
    891    return false;
    892  }
    893 
    894  TwoByteCharBuffer& buf = twoByteChars();
    895 
    896  size_t i = buf.length();
    897  if (!buf.growByUninitialized(utf16Len)) {
    898    return false;
    899  }
    900  MOZ_ASSERT(i + utf16Len == buf.length(),
    901             "growByUninitialized assumed to increase length immediately");
    902 
    903  char16_t* toFill = &buf[i];
    904  auto appendUtf16 = [&toFill](char16_t unit) {
    905    *toFill++ = unit;
    906    return LoopDisposition::Continue;
    907  };
    908 
    909  MOZ_ALWAYS_TRUE(InflateUTF8ToUTF16<OnUTF8Error::Throw>(
    910      maybeCx_, remainingUtf8, appendUtf16));
    911  MOZ_ASSERT(toFill == buf.end());
    912  return true;
    913 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE