tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

CharacterEncoding.h (14438B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #ifndef js_CharacterEncoding_h
      8 #define js_CharacterEncoding_h
      9 
     10 #include "mozilla/Range.h"
     11 #include "mozilla/Span.h"
     12 
     13 #include "js/TypeDecls.h"
     14 #include "js/Utility.h"
     15 
     16 class JSLinearString;
     17 
     18 namespace mozilla {
     19 union Utf8Unit;
     20 }
     21 
     22 namespace JS {
     23 
     24 /*
     25 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
     26 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
     27 * byte is treated as a 2-byte character, and there is no way to pass in a
     28 * string containing characters beyond U+00FF.
     29 */
     30 class Latin1Chars : public mozilla::Range<Latin1Char> {
     31  typedef mozilla::Range<Latin1Char> Base;
     32 
     33 public:
     34  using CharT = Latin1Char;
     35 
     36  Latin1Chars() = default;
     37  Latin1Chars(char* aBytes, size_t aLength)
     38      : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
     39  Latin1Chars(const Latin1Char* aBytes, size_t aLength)
     40      : Base(const_cast<Latin1Char*>(aBytes), aLength) {}
     41  Latin1Chars(const char* aBytes, size_t aLength)
     42      : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)),
     43             aLength) {}
     44 };
     45 
     46 /*
     47 * Like Latin1Chars, but the chars are const.
     48 */
     49 class ConstLatin1Chars : public mozilla::Range<const Latin1Char> {
     50  typedef mozilla::Range<const Latin1Char> Base;
     51 
     52 public:
     53  using CharT = Latin1Char;
     54 
     55  ConstLatin1Chars() = default;
     56  ConstLatin1Chars(const Latin1Char* aChars, size_t aLength)
     57      : Base(aChars, aLength) {}
     58 };
     59 
     60 /*
     61 * A Latin1Chars, but with \0 termination for C compatibility.
     62 */
     63 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char> {
     64  typedef mozilla::RangedPtr<Latin1Char> Base;
     65 
     66 public:
     67  using CharT = Latin1Char;
     68 
     69  Latin1CharsZ() : Base(nullptr, 0) {}  // NOLINT
     70 
     71  Latin1CharsZ(char* aBytes, size_t aLength)
     72      : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {
     73    MOZ_ASSERT(aBytes[aLength] == '\0');
     74  }
     75 
     76  Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) {
     77    MOZ_ASSERT(aBytes[aLength] == '\0');
     78  }
     79 
     80  using Base::operator=;
     81 
     82  char* c_str() { return reinterpret_cast<char*>(get()); }
     83 };
     84 
     85 class UTF8Chars : public mozilla::Range<unsigned char> {
     86  typedef mozilla::Range<unsigned char> Base;
     87 
     88 public:
     89  using CharT = unsigned char;
     90 
     91  UTF8Chars() = default;
     92  UTF8Chars(char* aBytes, size_t aLength)
     93      : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {}
     94  UTF8Chars(const char* aBytes, size_t aLength)
     95      : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)),
     96             aLength) {}
     97  UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength)
     98      : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength) {}
     99  UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength)
    100      : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength) {}
    101 };
    102 
    103 /*
    104 * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
    105 */
    106 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> {
    107  typedef mozilla::RangedPtr<unsigned char> Base;
    108 
    109 public:
    110  using CharT = unsigned char;
    111 
    112  UTF8CharsZ() : Base(nullptr, 0) {}  // NOLINT
    113 
    114  UTF8CharsZ(char* aBytes, size_t aLength)
    115      : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {
    116    MOZ_ASSERT(aBytes[aLength] == '\0');
    117  }
    118 
    119  UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) {
    120    MOZ_ASSERT(aBytes[aLength] == '\0');
    121  }
    122 
    123  UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength)
    124      : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength) {}
    125 
    126  using Base::operator=;
    127 
    128  char* c_str() { return reinterpret_cast<char*>(get()); }
    129 };
    130 
    131 /*
    132 * A wrapper for a "const char*" that is encoded using UTF-8.
    133 * This class does not manage ownership of the data; that is left
    134 * to others.  This differs from UTF8CharsZ in that the chars are
    135 * const and it disallows assignment.
    136 */
    137 class JS_PUBLIC_API ConstUTF8CharsZ {
    138  const char* data_;
    139 
    140 public:
    141  using CharT = unsigned char;
    142 
    143  ConstUTF8CharsZ() : data_(nullptr) {}
    144 
    145  explicit ConstUTF8CharsZ(const char* aBytes) : data_(aBytes) {
    146 #ifdef DEBUG
    147    if (aBytes) {
    148      validateWithoutLength();
    149    }
    150 #endif
    151  }
    152 
    153  ConstUTF8CharsZ(const char* aBytes, size_t aLength) : data_(aBytes) {
    154    MOZ_ASSERT(aBytes[aLength] == '\0');
    155 #ifdef DEBUG
    156    validate(aLength);
    157 #endif
    158  }
    159 
    160  const void* get() const { return data_; }
    161 
    162  const char* c_str() const { return data_; }
    163 
    164  explicit operator bool() const { return data_ != nullptr; }
    165 
    166 private:
    167 #ifdef DEBUG
    168  void validate(size_t aLength);
    169  void validateWithoutLength();
    170 #endif
    171 };
    172 
    173 /*
    174 * SpiderMonkey uses a 2-byte character representation: it is a
    175 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
    176 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
    177 * sufficiently dedicated JavaScript program to be fully unicode-aware by
    178 * manually interpreting UTF-16 extension characters embedded in the JS
    179 * string.
    180 */
    181 class TwoByteChars : public mozilla::Range<char16_t> {
    182  typedef mozilla::Range<char16_t> Base;
    183 
    184 public:
    185  using CharT = char16_t;
    186 
    187  TwoByteChars() = default;
    188  TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
    189  TwoByteChars(const char16_t* aChars, size_t aLength)
    190      : Base(const_cast<char16_t*>(aChars), aLength) {}
    191 };
    192 
    193 /*
    194 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
    195 */
    196 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t> {
    197  typedef mozilla::RangedPtr<char16_t> Base;
    198 
    199 public:
    200  using CharT = char16_t;
    201 
    202  TwoByteCharsZ() : Base(nullptr, 0) {}  // NOLINT
    203 
    204  TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) {
    205    MOZ_ASSERT(chars[length] == '\0');
    206  }
    207 
    208  using Base::operator=;
    209 };
    210 
    211 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
    212 
    213 /*
    214 * Like TwoByteChars, but the chars are const.
    215 */
    216 class ConstTwoByteChars : public mozilla::Range<const char16_t> {
    217  typedef mozilla::Range<const char16_t> Base;
    218 
    219 public:
    220  using CharT = char16_t;
    221 
    222  ConstTwoByteChars() = default;
    223  ConstTwoByteChars(const char16_t* aChars, size_t aLength)
    224      : Base(aChars, aLength) {}
    225 };
    226 
    227 /*
    228 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
    229 * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
    230 * contains any UTF-16 extension characters, then this may give invalid Latin1
    231 * output. The returned string is zero terminated. The returned string or the
    232 * returned string's |start()| must be freed with JS_free or js_free,
    233 * respectively. If allocation fails, an OOM error will be set and the method
    234 * will return a nullptr chars (which can be tested for with the ! operator).
    235 * This method cannot trigger GC.
    236 */
    237 extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(
    238    JSContext* cx, const mozilla::Range<const char16_t>& tbchars);
    239 
    240 inline Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
    241                                                       const char16_t* begin,
    242                                                       size_t length) {
    243  const mozilla::Range<const char16_t> tbchars(begin, length);
    244  return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
    245 }
    246 
    247 template <typename CharT, typename Allocator>
    248 extern UTF8CharsZ CharsToNewUTF8CharsZ(Allocator* alloc,
    249                                       const mozilla::Range<CharT>& chars);
    250 
    251 JS_PUBLIC_API char32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer,
    252                                         int utf8Length);
    253 
    254 /*
    255 * Inflate bytes in UTF-8 encoding to char16_t.
    256 * - On error, returns an empty TwoByteCharsZ.
    257 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
    258 *   its length;  the length value excludes the trailing null.
    259 */
    260 extern JS_PUBLIC_API TwoByteCharsZ
    261 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars& utf8,
    262                            size_t* outlen, arena_id_t destArenaId);
    263 
    264 /*
    265 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8
    266 * characters will be replaced by \uFFFD. No exception will be thrown for
    267 * malformed UTF-8 input.
    268 */
    269 extern JS_PUBLIC_API TwoByteCharsZ
    270 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars& utf8,
    271                                 size_t* outlen, arena_id_t destArenaId);
    272 
    273 /*
    274 * Returns the length of the char buffer required to encode |s| as UTF8.
    275 * Does not include the null-terminator.
    276 */
    277 JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSLinearString* s);
    278 
    279 /*
    280 * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is
    281 * exhausted or too little space is available in |dst| to fit the scalar
    282 * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Return
    283 * the number of bytes of |dst| that were filled.
    284 *
    285 * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already
    286 * linear.
    287 *
    288 * Given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|,
    289 * if |JS::StringHasLatin1Chars(str)|, then |src| is always fully converted
    290 * if |dst.Length() >= JS_GetStringLength(str) * 2|. Otherwise |src| is
    291 * always fully converted if |dst.Length() >= JS_GetStringLength(str) * 3|.
    292 *
    293 * The exact space required is always |GetDeflatedUTF8StringLength(str)|.
    294 */
    295 JS_PUBLIC_API size_t DeflateStringToUTF8Buffer(JSLinearString* src,
    296                                               mozilla::Span<char> dst);
    297 
    /*
     * The smallest character encoding capable of fully representing a particular
     * string.
     */
    enum class SmallestEncoding {
      ASCII,
      Latin1,
      UTF16,
    };
    303 
    304 /*
    305 * Returns the smallest encoding possible for the given string: if all
    306 * codepoints are <128 then ASCII, otherwise if all codepoints are <256
    307 * Latin-1, else UTF16.
    308 */
    309 JS_PUBLIC_API SmallestEncoding FindSmallestEncoding(const UTF8Chars& utf8);
    310 
    311 /*
    312 * Return a null-terminated Latin-1 string copied from the input string,
    313 * storing its length (excluding null terminator) in |*outlen|.  Fail and
    314 * report an error if the string contains non-Latin-1 codepoints.  Returns
    315 * Latin1CharsZ() on failure.
    316 */
    317 extern JS_PUBLIC_API Latin1CharsZ
    318 UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars& utf8, size_t* outlen,
    319                           arena_id_t destArenaId);
    320 
    321 /*
    322 * Returns true if all characters in the given null-terminated string are
    323 * ASCII, i.e. < 0x80, false otherwise.
    324 */
    325 extern JS_PUBLIC_API bool StringIsASCII(const char* s);
    326 
    327 /*
    328 * Returns true if all characters in the given span are ASCII,
    329 * i.e. < 0x80, false otherwise.
    330 */
    331 extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s);
    332 
    333 /**
    334 * Encode a narrow multibyte character string to a UTF-8 string.
    335 *
    336 * NOTE: Should only be used when interacting with POSIX/OS functions and not
    337 *       for encoding ASCII/Latin-1/etc. strings to UTF-8.
    338 */
    339 extern JS_PUBLIC_API JS::UniqueChars EncodeNarrowToUtf8(JSContext* cx,
    340                                                        const char* chars);
    341 
    342 /**
    343 * Encode a wide string to a UTF-8 string.
    344 *
    345 * NOTE: Should only be used when interacting with Windows API functions.
    346 */
    347 extern JS_PUBLIC_API JS::UniqueChars EncodeWideToUtf8(JSContext* cx,
    348                                                      const wchar_t* chars);
    349 
    350 /**
    351 * Encode a UTF-8 string to a narrow multibyte character string.
    352 *
    353 * NOTE: Should only be used when interacting with POSIX/OS functions and not
    354 *       for encoding UTF-8 to ASCII/Latin-1/etc. strings.
    355 */
    356 extern JS_PUBLIC_API JS::UniqueChars EncodeUtf8ToNarrow(JSContext* cx,
    357                                                        const char* chars);
    358 
    359 /**
    360 * Encode a UTF-8 string to a wide string.
    361 *
    362 * NOTE: Should only be used when interacting with Windows API functions.
    363 */
    364 extern JS_PUBLIC_API JS::UniqueWideChars EncodeUtf8ToWide(JSContext* cx,
    365                                                          const char* chars);
    366 
    367 }  // namespace JS
    368 
    369 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
    370 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
    371 
    372 /**
    373 * DEPRECATED
    374 *
    375 * Allocate memory sufficient to contain the characters of |str| truncated to
    376 * Latin-1 and a trailing null terminator, fill the memory with the characters
    377 * interpreted in that manner plus the null terminator, and return a pointer to
    378 * the memory.
    379 *
    380 * This function *loses information* when it copies the characters of |str| if
    381 * |str| contains code units greater than 0xFF.  Additionally, users that
    382 * depend on null-termination will misinterpret the copied characters if |str|
    383 * contains any nulls.  Avoid using this function if possible, because it will
    384 * eventually be removed.
    385 */
    386 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToLatin1(JSContext* cx,
    387                                                             JSString* str);
    388 
    389 /**
    390 * DEPRECATED
    391 *
    392 * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string.
    393 *
    394 * This function *loses information* when it copies the characters of |str| if
    395 * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied
    396 * instead.
    397 *
    398 * The returned string is also subject to misinterpretation if |str| contains
    399 * any nulls (which are faithfully transcribed into the returned string, but
    400 * which will implicitly truncate the string if it's passed to functions that
    401 * expect null-terminated strings).
    402 *
    403 * Avoid using this function if possible, because we'll remove it once we can
    404 * devise a better API for the task.
    405 */
    406 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToUTF8(
    407    JSContext* cx, JS::Handle<JSString*> str);
    408 
    409 /**
    410 * DEPRECATED
    411 *
    412 * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string.
    413 *
    414 * This function asserts in debug mode that the input string contains only
    415 * ASCII characters.
    416 *
    417 * The returned string is also subject to misinterpretation if |str| contains
    418 * any nulls (which are faithfully transcribed into the returned string, but
    419 * which will implicitly truncate the string if it's passed to functions that
    420 * expect null-terminated strings).
    421 *
    422 * Avoid using this function if possible, because we'll remove it once we can
    423 * devise a better API for the task.
    424 */
    425 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToASCII(JSContext* cx,
    426                                                            JSString* str);
    427 
    428 #endif /* js_CharacterEncoding_h */