CharacterEncoding.h (14438B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef js_CharacterEncoding_h 8 #define js_CharacterEncoding_h 9 10 #include "mozilla/Range.h" 11 #include "mozilla/Span.h" 12 13 #include "js/TypeDecls.h" 14 #include "js/Utility.h" 15 16 class JSLinearString; 17 18 namespace mozilla { 19 union Utf8Unit; 20 } 21 22 namespace JS { 23 24 /* 25 * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI 26 * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each 27 * byte is treated as a 2-byte character, and there is no way to pass in a 28 * string containing characters beyond U+00FF. 29 */ 30 class Latin1Chars : public mozilla::Range<Latin1Char> { 31 typedef mozilla::Range<Latin1Char> Base; 32 33 public: 34 using CharT = Latin1Char; 35 36 Latin1Chars() = default; 37 Latin1Chars(char* aBytes, size_t aLength) 38 : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {} 39 Latin1Chars(const Latin1Char* aBytes, size_t aLength) 40 : Base(const_cast<Latin1Char*>(aBytes), aLength) {} 41 Latin1Chars(const char* aBytes, size_t aLength) 42 : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), 43 aLength) {} 44 }; 45 46 /* 47 * Like Latin1Chars, but the chars are const. 48 */ 49 class ConstLatin1Chars : public mozilla::Range<const Latin1Char> { 50 typedef mozilla::Range<const Latin1Char> Base; 51 52 public: 53 using CharT = Latin1Char; 54 55 ConstLatin1Chars() = default; 56 ConstLatin1Chars(const Latin1Char* aChars, size_t aLength) 57 : Base(aChars, aLength) {} 58 }; 59 60 /* 61 * A Latin1Chars, but with \0 termination for C compatibility. 62 */ 63 class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char> { 64 typedef mozilla::RangedPtr<Latin1Char> Base; 65 66 public: 67 using CharT = Latin1Char; 68 69 Latin1CharsZ() : Base(nullptr, 0) {} // NOLINT 70 71 Latin1CharsZ(char* aBytes, size_t aLength) 72 : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) { 73 MOZ_ASSERT(aBytes[aLength] == '\0'); 74 } 75 76 Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) { 77 MOZ_ASSERT(aBytes[aLength] == '\0'); 78 } 79 80 using Base::operator=; 81 82 char* c_str() { return reinterpret_cast<char*>(get()); } 83 }; 84 85 class UTF8Chars : public mozilla::Range<unsigned char> { 86 typedef mozilla::Range<unsigned char> Base; 87 88 public: 89 using CharT = unsigned char; 90 91 UTF8Chars() = default; 92 UTF8Chars(char* aBytes, size_t aLength) 93 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {} 94 UTF8Chars(const char* aBytes, size_t aLength) 95 : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), 96 aLength) {} 97 UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength) 98 : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength) {} 99 UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength) 100 : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength) {} 101 }; 102 103 /* 104 * SpiderMonkey also deals directly with UTF-8 encoded text in some places. 105 */ 106 class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> { 107 typedef mozilla::RangedPtr<unsigned char> Base; 108 109 public: 110 using CharT = unsigned char; 111 112 UTF8CharsZ() : Base(nullptr, 0) {} // NOLINT 113 114 UTF8CharsZ(char* aBytes, size_t aLength) 115 : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) { 116 MOZ_ASSERT(aBytes[aLength] == '\0'); 117 } 118 119 UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) { 120 MOZ_ASSERT(aBytes[aLength] == '\0'); 121 } 122 123 UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength) 124 : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength) {} 125 126 using Base::operator=; 127 128 char* c_str() { return reinterpret_cast<char*>(get()); } 129 }; 130 131 /* 132 * A wrapper for a "const char*" that is encoded using UTF-8. 133 * This class does not manage ownership of the data; that is left 134 * to others. This differs from UTF8CharsZ in that the chars are 135 * const and it disallows assignment. 136 */ 137 class JS_PUBLIC_API ConstUTF8CharsZ { 138 const char* data_; 139 140 public: 141 using CharT = unsigned char; 142 143 ConstUTF8CharsZ() : data_(nullptr) {} 144 145 explicit ConstUTF8CharsZ(const char* aBytes) : data_(aBytes) { 146 #ifdef DEBUG 147 if (aBytes) { 148 validateWithoutLength(); 149 } 150 #endif 151 } 152 153 ConstUTF8CharsZ(const char* aBytes, size_t aLength) : data_(aBytes) { 154 MOZ_ASSERT(aBytes[aLength] == '\0'); 155 #ifdef DEBUG 156 validate(aLength); 157 #endif 158 } 159 160 const void* get() const { return data_; } 161 162 const char* c_str() const { return data_; } 163 164 explicit operator bool() const { return data_ != nullptr; } 165 166 private: 167 #ifdef DEBUG 168 void validate(size_t aLength); 169 void validateWithoutLength(); 170 #endif 171 }; 172 173 /* 174 * SpiderMonkey uses a 2-byte character representation: it is a 175 * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2, 176 * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a 177 * sufficiently dedicated JavaScript program to be fully unicode-aware by 178 * manually interpreting UTF-16 extension characters embedded in the JS 179 * string. 180 */ 181 class TwoByteChars : public mozilla::Range<char16_t> { 182 typedef mozilla::Range<char16_t> Base; 183 184 public: 185 using CharT = char16_t; 186 187 TwoByteChars() = default; 188 TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {} 189 TwoByteChars(const char16_t* aChars, size_t aLength) 190 : Base(const_cast<char16_t*>(aChars), aLength) {} 191 }; 192 193 /* 194 * A TwoByteChars, but \0 terminated for compatibility with JSFlatString. 195 */ 196 class TwoByteCharsZ : public mozilla::RangedPtr<char16_t> { 197 typedef mozilla::RangedPtr<char16_t> Base; 198 199 public: 200 using CharT = char16_t; 201 202 TwoByteCharsZ() : Base(nullptr, 0) {} // NOLINT 203 204 TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) { 205 MOZ_ASSERT(chars[length] == '\0'); 206 } 207 208 using Base::operator=; 209 }; 210 211 typedef mozilla::RangedPtr<const char16_t> ConstCharPtr; 212 213 /* 214 * Like TwoByteChars, but the chars are const. 215 */ 216 class ConstTwoByteChars : public mozilla::Range<const char16_t> { 217 typedef mozilla::Range<const char16_t> Base; 218 219 public: 220 using CharT = char16_t; 221 222 ConstTwoByteChars() = default; 223 ConstTwoByteChars(const char16_t* aChars, size_t aLength) 224 : Base(aChars, aLength) {} 225 }; 226 227 /* 228 * Convert a 2-byte character sequence to "ISO-Latin-1". This works by 229 * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source 230 * contains any UTF-16 extension characters, then this may give invalid Latin1 231 * output. The returned string is zero terminated. The returned string or the 232 * returned string's |start()| must be freed with JS_free or js_free, 233 * respectively. If allocation fails, an OOM error will be set and the method 234 * will return a nullptr chars (which can be tested for with the ! operator). 235 * This method cannot trigger GC. 236 */ 237 extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ( 238 JSContext* cx, const mozilla::Range<const char16_t>& tbchars); 239 240 inline Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx, 241 const char16_t* begin, 242 size_t length) { 243 const mozilla::Range<const char16_t> tbchars(begin, length); 244 return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars); 245 } 246 247 template <typename CharT, typename Allocator> 248 extern UTF8CharsZ CharsToNewUTF8CharsZ(Allocator* alloc, 249 const mozilla::Range<CharT>& chars); 250 251 JS_PUBLIC_API char32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, 252 int utf8Length); 253 254 /* 255 * Inflate bytes in UTF-8 encoding to char16_t. 256 * - On error, returns an empty TwoByteCharsZ. 257 * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold 258 * its length; the length value excludes the trailing null. 259 */ 260 extern JS_PUBLIC_API TwoByteCharsZ 261 UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars& utf8, 262 size_t* outlen, arena_id_t destArenaId); 263 264 /* 265 * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 266 * characters will be replaced by \uFFFD. No exception will be thrown for 267 * malformed UTF-8 input. 268 */ 269 extern JS_PUBLIC_API TwoByteCharsZ 270 LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars& utf8, 271 size_t* outlen, arena_id_t destArenaId); 272 273 /* 274 * Returns the length of the char buffer required to encode |s| as UTF8. 275 * Does not include the null-terminator. 276 */ 277 JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSLinearString* s); 278 279 /* 280 * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is 281 * exhausted or too little space is available in |dst| to fit the scalar 282 * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Return 283 * the number of bytes of |dst| that were filled. 284 * 285 * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already 286 * linear. 287 * 288 * Given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|, 289 * if |JS::StringHasLatin1Chars(str)|, then |src| is always fully converted 290 * if |dst.Length() >= JS_GetStringLength(str) * 2|. Otherwise |src| is 291 * always fully converted if |dst.Length() >= JS_GetStringLength(str) * 3|. 292 * 293 * The exact space required is always |GetDeflatedUTF8StringLength(str)|. 294 */ 295 JS_PUBLIC_API size_t DeflateStringToUTF8Buffer(JSLinearString* src, 296 mozilla::Span<char> dst); 297 298 /* 299 * The smallest character encoding capable of fully representing a particular 300 * string. 301 */ 302 enum class SmallestEncoding { ASCII, Latin1, UTF16 }; 303 304 /* 305 * Returns the smallest encoding possible for the given string: if all 306 * codepoints are <128 then ASCII, otherwise if all codepoints are <256 307 * Latin-1, else UTF16. 308 */ 309 JS_PUBLIC_API SmallestEncoding FindSmallestEncoding(const UTF8Chars& utf8); 310 311 /* 312 * Return a null-terminated Latin-1 string copied from the input string, 313 * storing its length (excluding null terminator) in |*outlen|. Fail and 314 * report an error if the string contains non-Latin-1 codepoints. Returns 315 * Latin1CharsZ() on failure. 316 */ 317 extern JS_PUBLIC_API Latin1CharsZ 318 UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars& utf8, size_t* outlen, 319 arena_id_t destArenaId); 320 321 /* 322 * Returns true if all characters in the given null-terminated string are 323 * ASCII, i.e. < 0x80, false otherwise. 324 */ 325 extern JS_PUBLIC_API bool StringIsASCII(const char* s); 326 327 /* 328 * Returns true if all characters in the given span are ASCII, 329 * i.e. < 0x80, false otherwise. 330 */ 331 extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s); 332 333 /** 334 * Encode a narrow multibyte character string to a UTF-8 string. 335 * 336 * NOTE: Should only be used when interacting with POSIX/OS functions and not 337 * for encoding ASCII/Latin-1/etc. strings to UTF-8. 338 */ 339 extern JS_PUBLIC_API JS::UniqueChars EncodeNarrowToUtf8(JSContext* cx, 340 const char* chars); 341 342 /** 343 * Encode a wide string to a UTF-8 string. 344 * 345 * NOTE: Should only be used when interacting with Windows API functions. 346 */ 347 extern JS_PUBLIC_API JS::UniqueChars EncodeWideToUtf8(JSContext* cx, 348 const wchar_t* chars); 349 350 /** 351 * Encode a UTF-8 string to a narrow multibyte character string. 352 * 353 * NOTE: Should only be used when interacting with POSIX/OS functions and not 354 * for encoding UTF-8 to ASCII/Latin-1/etc. strings. 355 */ 356 extern JS_PUBLIC_API JS::UniqueChars EncodeUtf8ToNarrow(JSContext* cx, 357 const char* chars); 358 359 /** 360 * Encode a UTF-8 string to a wide string. 361 * 362 * NOTE: Should only be used when interacting with Windows API functions. 363 */ 364 extern JS_PUBLIC_API JS::UniqueWideChars EncodeUtf8ToWide(JSContext* cx, 365 const char* chars); 366 367 } // namespace JS 368 369 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); } 370 inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); } 371 372 /** 373 * DEPRECATED 374 * 375 * Allocate memory sufficient to contain the characters of |str| truncated to 376 * Latin-1 and a trailing null terminator, fill the memory with the characters 377 * interpreted in that manner plus the null terminator, and return a pointer to 378 * the memory. 379 * 380 * This function *loses information* when it copies the characters of |str| if 381 * |str| contains code units greater than 0xFF. Additionally, users that 382 * depend on null-termination will misinterpret the copied characters if |str| 383 * contains any nulls. Avoid using this function if possible, because it will 384 * eventually be removed. 385 */ 386 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToLatin1(JSContext* cx, 387 JSString* str); 388 389 /** 390 * DEPRECATED 391 * 392 * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string. 393 * 394 * This function *loses information* when it copies the characters of |str| if 395 * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied 396 * instead. 397 * 398 * The returned string is also subject to misinterpretation if |str| contains 399 * any nulls (which are faithfully transcribed into the returned string, but 400 * which will implicitly truncate the string if it's passed to functions that 401 * expect null-terminated strings). 402 * 403 * Avoid using this function if possible, because we'll remove it once we can 404 * devise a better API for the task. 405 */ 406 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToUTF8( 407 JSContext* cx, JS::Handle<JSString*> str); 408 409 /** 410 * DEPRECATED 411 * 412 * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string. 413 * 414 * This function asserts in debug mode that the input string contains only 415 * ASCII characters. 416 * 417 * The returned string is also subject to misinterpretation if |str| contains 418 * any nulls (which are faithfully transcribed into the returned string, but 419 * which will implicitly truncate the string if it's passed to functions that 420 * expect null-terminated strings). 421 * 422 * Avoid using this function if possible, because we'll remove it once we can 423 * devise a better API for the task. 424 */ 425 extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToASCII(JSContext* cx, 426 JSString* str); 427 428 #endif /* js_CharacterEncoding_h */