Encoding.h (57371B)
1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT 2 // file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the 11 // "top-level directory" in the above notice refers to 12 // third_party/rust/encoding_c/. 13 14 #ifndef mozilla_Encoding_h 15 #define mozilla_Encoding_h 16 17 #include "mozilla/CheckedInt.h" 18 #include "mozilla/Maybe.h" 19 #include "mozilla/NotNull.h" 20 #include "mozilla/Span.h" 21 #include "nsString.h" 22 23 #include <tuple> 24 25 namespace mozilla { 26 class Encoding; 27 class Decoder; 28 class Encoder; 29 }; // namespace mozilla 30 31 #define ENCODING_RS_ENCODING mozilla::Encoding 32 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \ 33 mozilla::NotNull<const mozilla::Encoding*> 34 #define ENCODING_RS_ENCODER mozilla::Encoder 35 #define ENCODING_RS_DECODER mozilla::Decoder 36 37 #include "encoding_rs.h" 38 39 extern "C" { 40 41 nsresult mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding, 42 uint8_t const* src, size_t src_len, 43 nsAString* dst); 44 45 nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal( 46 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len, 47 nsAString* dst); 48 49 nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling( 50 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len, 51 nsAString* dst); 52 53 nsresult 54 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement( 55 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len, 56 nsAString* dst); 57 58 nsresult mozilla_encoding_encode_from_utf16(mozilla::Encoding 
const** encoding, 59 char16_t const* src, size_t src_len, 60 nsACString* dst); 61 62 nsresult mozilla_encoding_decode_to_nscstring( 63 mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst); 64 65 nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal( 66 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst); 67 68 nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling( 69 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst); 70 71 nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling( 72 mozilla::Encoding const* encoding, uint8_t const* src, size_t src_len, 73 nsACString* dst, size_t already_validated); 74 75 nsresult 76 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement( 77 mozilla::Encoding const* encoding, nsACString const* src, nsACString* dst); 78 79 nsresult mozilla_encoding_encode_from_nscstring( 80 mozilla::Encoding const** encoding, nsACString const* src, nsACString* dst); 81 82 } // extern "C" 83 84 namespace mozilla { 85 86 /** 87 * Return value from `Decoder`/`Encoder` to indicate that input 88 * was exhausted. 89 */ 90 const uint32_t kInputEmpty = INPUT_EMPTY; 91 92 /** 93 * Return value from `Decoder`/`Encoder` to indicate that output 94 * space was insufficient. 95 */ 96 const uint32_t kOutputFull = OUTPUT_FULL; 97 98 /** 99 * An encoding as defined in the Encoding Standard 100 * (https://encoding.spec.whatwg.org/). 101 * 102 * See https://docs.rs/encoding_rs/ for the Rust API docs. 103 * 104 * An _encoding_ defines a mapping from a byte sequence to a Unicode code point 105 * sequence and, in most cases, vice versa. Each encoding has a name, an output 106 * encoding, and one or more labels. 107 * 108 * _Labels_ are ASCII-case-insensitive strings that are used to identify an 109 * encoding in formats and protocols. 
 * The _name_ of the encoding is the
 * preferred label in the case appropriate for returning from the
 * `characterSet` property of the `Document` DOM interface, except for
 * the replacement encoding whose name is not one of its labels.
 *
 * The _output encoding_ is the encoding used for form submission and URL
 * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
 * UTF-16LE and UTF-16BE encodings and the encoding itself for other
 * encodings.
 *
 * # Streaming vs. Non-Streaming
 *
 * When you have the entire input in a single buffer, you can use the
 * methods `Decode()`, `DecodeWithBOMRemoval()`,
 * `DecodeWithoutBOMHandling()`,
 * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
 * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
 * `NewEncoder()` methods), these methods perform heap allocations. You should
 * use the `Decoder` and `Encoder` objects when your input is split into
 * multiple buffers or when you want to control the allocation of the output
 * buffers.
 *
 * # Instances
 *
 * All instances of `Encoding` are statically allocated and have the process's
 * lifetime. There is precisely one unique `Encoding` instance for each
 * encoding defined in the Encoding Standard.
 *
 * To obtain a reference to a particular encoding whose identity you know at
 * compile time, use a `static` that refers to the encoding. There is a
 * `static` for each encoding. The `static`s are named in all caps with hyphens
 * replaced with underscores and with `_ENCODING` appended to the
 * name. For example, if you know at compile time that you will want to
 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
 *
 * If you don't know what encoding you need at compile time and need to
 * dynamically get an encoding by label, use `Encoding::ForLabel()`.
145 * 146 * Pointers to `Encoding` can be compared with `==` to check for the sameness 147 * of two encodings. 148 * 149 * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer 150 * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use 151 * `const mozilla::Encoding*` in the C signature and 152 * `*const encoding_rs::Encoding` is the corresponding Rust signature. 153 */ 154 class Encoding final { 155 public: 156 /** 157 * Implements the _get an encoding_ algorithm 158 * (https://encoding.spec.whatwg.org/#concept-encoding-get). 159 * 160 * If, after ASCII-lowercasing and removing leading and trailing 161 * whitespace, the argument matches a label defined in the Encoding 162 * Standard, `const Encoding*` representing the corresponding 163 * encoding is returned. If there is no match, `nullptr` is returned. 164 * 165 * This is the right method to use if the action upon the method returning 166 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`) 167 * instead. When the action upon the method returning `nullptr` is not to 168 * proceed with a fallback but to refuse processing, 169 * `ForLabelNoReplacement()` is more appropriate. 170 */ 171 static inline const Encoding* ForLabel(Span<const char> aLabel) { 172 return encoding_for_label( 173 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length()); 174 } 175 176 /** 177 * `nsAString` argument version. See above for docs. 178 */ 179 static inline const Encoding* ForLabel(const nsAString& aLabel) { 180 return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel)); 181 } 182 183 /** 184 * This method behaves the same as `ForLabel()`, except when `ForLabel()` 185 * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead. 
186 * 187 * This method is useful in scenarios where a fatal error is required 188 * upon invalid label, because in those cases the caller typically wishes 189 * to treat the labels that map to the replacement encoding as fatal 190 * errors, too. 191 * 192 * It is not OK to use this method when the action upon the method returning 193 * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In 194 * such a case, the `ForLabel()` method should be used instead in order to 195 * avoid unsafe fallback for labels that `ForLabel()` maps to 196 * `REPLACEMENT_ENCODING`. 197 */ 198 static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) { 199 return encoding_for_label_no_replacement( 200 reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length()); 201 } 202 203 /** 204 * `nsAString` argument version. See above for docs. 205 */ 206 static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) { 207 return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel)); 208 } 209 210 /** 211 * Performs non-incremental BOM sniffing. 212 * 213 * The argument must either be a buffer representing the entire input 214 * stream (non-streaming case) or a buffer representing at least the first 215 * three bytes of the input stream (streaming case). 216 * 217 * Returns `{UTF_8_ENCODING, 3}`, 218 * `{UTF_16LE_ENCODING, 2}` or 219 * `{UTF_16BE_ENCODING, 3}` if the argument starts with the 220 * UTF-8, UTF-16LE or UTF-16BE BOM or `{nullptr, 0}` otherwise. 221 */ 222 static inline std::tuple<const Encoding*, size_t> ForBOM( 223 Span<const uint8_t> aBuffer) { 224 size_t len = aBuffer.Length(); 225 const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len); 226 return {encoding, len}; 227 } 228 229 /** 230 * Writes the name of this encoding into `aName`. 231 * 232 * This name is appropriate to return as-is from the DOM 233 * `document.characterSet` property. 
234 */ 235 inline void Name(nsACString& aName) const { 236 aName.SetLength(ENCODING_NAME_MAX_LENGTH); 237 size_t length = 238 encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting())); 239 aName.SetLength(length); // truncation is the 64-bit case is OK 240 } 241 242 /** 243 * Checks whether the _output encoding_ of this encoding can encode every 244 * Unicode code point. (Only true if the output encoding is UTF-8.) 245 */ 246 inline bool CanEncodeEverything() const { 247 return encoding_can_encode_everything(this); 248 } 249 250 /** 251 * Checks whether this encoding maps one byte to one Basic Multilingual 252 * Plane code point (i.e. byte length equals decoded UTF-16 length) and 253 * vice versa (for mappable characters). 254 * 255 * `true` iff this encoding is on the list of Legacy single-byte 256 * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings) 257 * in the spec or x-user-defined. 258 */ 259 inline bool IsSingleByte() const { return encoding_is_single_byte(this); } 260 261 /** 262 * Checks whether the bytes 0x00...0x7F map exclusively to the characters 263 * U+0000...U+007F and vice versa. 264 */ 265 inline bool IsAsciiCompatible() const { 266 return encoding_is_ascii_compatible(this); 267 } 268 269 /** 270 * Checks whether this is a Japanese legacy encoding. 271 */ 272 inline bool IsJapaneseLegacy() const { 273 return this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING || 274 this == ISO_2022_JP_ENCODING; 275 } 276 277 /** 278 * Returns the _output encoding_ of this encoding. This is UTF-8 for 279 * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise. 280 */ 281 inline NotNull<const mozilla::Encoding*> OutputEncoding() const { 282 return WrapNotNull(encoding_output_encoding(this)); 283 } 284 285 /** 286 * Decode complete input to `nsACString` _with BOM sniffing_ and with 287 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 288 * entire input is available as a single buffer (i.e. 
the end of the 289 * buffer marks the end of the stream). 290 * 291 * This method implements the (non-streaming version of) the 292 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept. 293 * 294 * The second item in the returned tuple is the encoding that was actually 295 * used (which may differ from this encoding thanks to BOM sniffing). 296 * 297 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 298 * if there were malformed sequences (that were replaced with the 299 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the 300 * tuple. 301 * 302 * The backing buffer of the string isn't copied if the input buffer 303 * is heap-allocated and decoding from UTF-8 and the input is valid 304 * BOMless UTF-8, decoding from an ASCII-compatible encoding and 305 * the input is valid ASCII or decoding from ISO-2022-JP and the 306 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass 307 * the same string as both arguments. 308 * 309 * _Note:_ It is wrong to use this when the input buffer represents only 310 * a segment of the input instead of the whole input. Use `NewDecoder()` 311 * when decoding segmented input. 312 */ 313 inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode( 314 const nsACString& aBytes, nsACString& aOut) const { 315 const Encoding* encoding = this; 316 const nsACString* bytes = &aBytes; 317 nsACString* out = &aOut; 318 nsresult rv; 319 if (bytes == out) { 320 nsAutoCString temp(aBytes); 321 rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out); 322 } else { 323 rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out); 324 } 325 return {rv, WrapNotNull(encoding)}; 326 } 327 328 /** 329 * Decode complete input to `nsAString` _with BOM sniffing_ and with 330 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 331 * entire input is available as a single buffer (i.e. the end of the 332 * buffer marks the end of the stream). 
333 * 334 * This method implements the (non-streaming version of) the 335 * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept. 336 * 337 * The second item in the returned tuple is the encoding that was actually 338 * used (which may differ from this encoding thanks to BOM sniffing). 339 * 340 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 341 * if there were malformed sequences (that were replaced with the 342 * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the 343 * tuple. 344 * 345 * _Note:_ It is wrong to use this when the input buffer represents only 346 * a segment of the input instead of the whole input. Use `NewDecoder()` 347 * when decoding segmented input. 348 */ 349 inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode( 350 Span<const uint8_t> aBytes, nsAString& aOut) const { 351 const Encoding* encoding = this; 352 nsresult rv = mozilla_encoding_decode_to_nsstring( 353 &encoding, aBytes.Elements(), aBytes.Length(), &aOut); 354 return {rv, WrapNotNull(encoding)}; 355 } 356 357 /** 358 * Decode complete input to `nsACString` _with BOM removal_ and with 359 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 360 * entire input is available as a single buffer (i.e. the end of the 361 * buffer marks the end of the stream). 362 * 363 * When invoked on `UTF_8`, this method implements the (non-streaming 364 * version of) the _UTF-8 decode_ 365 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept. 366 * 367 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 368 * if there were malformed sequences (that were replaced with the 369 * REPLACEMENT CHARACTER) and `NS_OK` otherwise. 
370 * 371 * The backing buffer of the string isn't copied if the input buffer 372 * is heap-allocated and decoding from UTF-8 and the input is valid 373 * BOMless UTF-8, decoding from an ASCII-compatible encoding and 374 * the input is valid ASCII or decoding from ISO-2022-JP and the 375 * input stays in the ASCII state of ISO-2022-JP. It is OK to pass 376 * the same string as both arguments. 377 * 378 * _Note:_ It is wrong to use this when the input buffer represents only 379 * a segment of the input instead of the whole input. Use 380 * `NewDecoderWithBOMRemoval()` when decoding segmented input. 381 */ 382 inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes, 383 nsACString& aOut) const { 384 const nsACString* bytes = &aBytes; 385 nsACString* out = &aOut; 386 if (bytes == out) { 387 nsAutoCString temp(aBytes); 388 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, &temp, 389 out); 390 } 391 return mozilla_encoding_decode_to_nscstring_with_bom_removal(this, bytes, 392 out); 393 } 394 395 /** 396 * Decode complete input to `nsAString` _with BOM removal_ and with 397 * malformed sequences replaced with the REPLACEMENT CHARACTER when the 398 * entire input is available as a single buffer (i.e. the end of the 399 * buffer marks the end of the stream). 400 * 401 * When invoked on `UTF_8`, this method implements the (non-streaming 402 * version of) the _UTF-8 decode_ 403 * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept. 404 * 405 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 406 * if there were malformed sequences (that were replaced with the 407 * REPLACEMENT CHARACTER) and `NS_OK` otherwise. 408 * 409 * _Note:_ It is wrong to use this when the input buffer represents only 410 * a segment of the input instead of the whole input. Use 411 * `NewDecoderWithBOMRemoval()` when decoding segmented input. 
412 */ 413 inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes, 414 nsAString& aOut) const { 415 return mozilla_encoding_decode_to_nsstring_with_bom_removal( 416 this, aBytes.Elements(), aBytes.Length(), &aOut); 417 } 418 419 /** 420 * Decode complete input to `nsACString` _without BOM handling_ and 421 * with malformed sequences replaced with the REPLACEMENT CHARACTER when 422 * the entire input is available as a single buffer (i.e. the end of the 423 * buffer marks the end of the stream). 424 * 425 * When invoked on `UTF_8`, this method implements the (non-streaming 426 * version of) the _UTF-8 decode without BOM_ 427 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. 428 * 429 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 430 * if there were malformed sequences (that were replaced with the 431 * REPLACEMENT CHARACTER) and `NS_OK` otherwise. 432 * 433 * The backing buffer of the string isn't copied if the input buffer 434 * is heap-allocated and decoding from UTF-8 and the input is valid 435 * UTF-8, decoding from an ASCII-compatible encoding and the input 436 * is valid ASCII or decoding from ISO-2022-JP and the input stays 437 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string 438 * as both arguments. 439 * 440 * _Note:_ It is wrong to use this when the input buffer represents only 441 * a segment of the input instead of the whole input. Use 442 * `NewDecoderWithoutBOMHandling()` when decoding segmented input. 
443 */ 444 inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes, 445 nsACString& aOut) const { 446 const nsACString* bytes = &aBytes; 447 nsACString* out = &aOut; 448 if (bytes == out) { 449 nsAutoCString temp(aBytes); 450 return mozilla_encoding_decode_to_nscstring_without_bom_handling( 451 this, &temp, out); 452 } 453 return mozilla_encoding_decode_to_nscstring_without_bom_handling( 454 this, bytes, out); 455 } 456 457 /** 458 * Decode complete input to `nsAString` _without BOM handling_ and 459 * with malformed sequences replaced with the REPLACEMENT CHARACTER when 460 * the entire input is available as a single buffer (i.e. the end of the 461 * buffer marks the end of the stream). 462 * 463 * When invoked on `UTF_8`, this method implements the (non-streaming 464 * version of) the _UTF-8 decode without BOM_ 465 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. 466 * 467 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 468 * if there were malformed sequences (that were replaced with the 469 * REPLACEMENT CHARACTER) and `NS_OK` otherwise. 470 * 471 * _Note:_ It is wrong to use this when the input buffer represents only 472 * a segment of the input instead of the whole input. Use 473 * `NewDecoderWithoutBOMHandling()` when decoding segmented input. 474 */ 475 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes, 476 nsAString& aOut) const { 477 return mozilla_encoding_decode_to_nsstring_without_bom_handling( 478 this, aBytes.Elements(), aBytes.Length(), &aOut); 479 } 480 481 /** 482 * Decode complete input to `nsACString` _without BOM handling_ and 483 * _with malformed sequences treated as fatal_ when the entire input is 484 * available as a single buffer (i.e. the end of the buffer marks the end 485 * of the stream). 
486 * 487 * When invoked on `UTF_8`, this method implements the (non-streaming 488 * version of) the _UTF-8 decode without BOM or fail_ 489 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail) 490 * spec concept. 491 * 492 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT` 493 * if a malformed sequence was encountered and `NS_OK` otherwise. 494 * 495 * The backing buffer of the string isn't copied if the input buffer 496 * is heap-allocated and decoding from UTF-8 and the input is valid 497 * UTF-8, decoding from an ASCII-compatible encoding and the input 498 * is valid ASCII or decoding from ISO-2022-JP and the input stays 499 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string 500 * as both arguments. 501 * 502 * _Note:_ It is wrong to use this when the input buffer represents only 503 * a segment of the input instead of the whole input. Use 504 * `NewDecoderWithoutBOMHandling()` when decoding segmented input. 505 */ 506 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement( 507 const nsACString& aBytes, nsACString& aOut) const { 508 const nsACString* bytes = &aBytes; 509 nsACString* out = &aOut; 510 if (bytes == out) { 511 nsAutoCString temp(aBytes); 512 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement( 513 this, &temp, out); 514 } 515 return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement( 516 this, bytes, out); 517 } 518 519 /** 520 * Decode complete input to `nsACString` _without BOM handling_ and 521 * with malformed sequences replaced with the REPLACEMENT CHARACTER when 522 * the entire input is available as a single buffer (i.e. the end of the 523 * buffer marks the end of the stream) _asserting that a number of bytes 524 * from the start are already known to be valid UTF-8_. 525 * 526 * The use case for this method is avoiding copying when dealing with 527 * input that has a UTF-8 BOM. 
_When in doubt, do not use this method._ 528 * 529 * When invoked on `UTF_8`, this method implements the (non-streaming 530 * version of) the _UTF-8 decode without BOM_ 531 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept. 532 * 533 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS` 534 * if there were malformed sequences (that were replaced with the 535 * REPLACEMENT CHARACTER) and `NS_OK` otherwise. 536 * 537 * _Note:_ It is wrong to use this when the input buffer represents only 538 * a segment of the input instead of the whole input. Use 539 * `NewDecoderWithoutBOMHandling()` when decoding segmented input. 540 * 541 * # Safety 542 * 543 * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8. 544 * `aBytes` _must not_ alias the buffer (if any) of `aOut`. 545 */ 546 inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes, 547 nsACString& aOut, 548 size_t aAlreadyValidated) const { 549 return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling( 550 this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated); 551 } 552 553 /** 554 * Decode complete input to `nsAString` _without BOM handling_ and 555 * _with malformed sequences treated as fatal_ when the entire input is 556 * available as a single buffer (i.e. the end of the buffer marks the end 557 * of the stream). 558 * 559 * When invoked on `UTF_8`, this method implements the (non-streaming 560 * version of) the _UTF-8 decode without BOM or fail_ 561 * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail) 562 * spec concept. 563 * 564 * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT` 565 * if a malformed sequence was encountered and `NS_OK` otherwise. 566 * 567 * _Note:_ It is wrong to use this when the input buffer represents only 568 * a segment of the input instead of the whole input. Use 569 * `NewDecoderWithoutBOMHandling()` when decoding segmented input. 
570 */ 571 inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement( 572 Span<const uint8_t> aBytes, nsAString& aOut) const { 573 return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement( 574 this, aBytes.Elements(), aBytes.Length(), &aOut); 575 } 576 577 /** 578 * Encode complete input to `nsACString` with unmappable characters 579 * replaced with decimal numeric character references when the entire input 580 * is available as a single buffer (i.e. the end of the buffer marks the 581 * end of the stream). 582 * 583 * This method implements the (non-streaming version of) the 584 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept. 585 * 586 * The second item in the returned tuple is the encoding that was actually 587 * used (which may differ from this encoding thanks to some encodings 588 * having UTF-8 as their output encoding). 589 * 590 * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if 591 * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM, 592 * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were 593 * replaced with numeric character references) and `NS_OK` otherwise. 594 * 595 * The backing buffer of the string isn't copied if the input buffer 596 * is heap-allocated and encoding to UTF-8 and the input is valid 597 * UTF-8, encoding to an ASCII-compatible encoding and the input 598 * is valid ASCII or encoding from ISO-2022-JP and the input stays 599 * in the ASCII state of ISO-2022-JP. It is OK to pass the same string 600 * as both arguments. 601 * 602 * _Note:_ It is wrong to use this when the input buffer represents only 603 * a segment of the input instead of the whole input. Use `NewEncoder()` 604 * when encoding segmented output. 
605 */ 606 inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode( 607 const nsACString& aString, nsACString& aOut) const { 608 const Encoding* encoding = this; 609 const nsACString* string = &aString; 610 nsACString* out = &aOut; 611 nsresult rv; 612 if (string == out) { 613 nsAutoCString temp(aString); 614 rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out); 615 } else { 616 rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out); 617 } 618 return {rv, WrapNotNull(encoding)}; 619 } 620 621 /** 622 * Encode complete input to `nsACString` with unmappable characters 623 * replaced with decimal numeric character references when the entire input 624 * is available as a single buffer (i.e. the end of the buffer marks the 625 * end of the stream). 626 * 627 * This method implements the (non-streaming version of) the 628 * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept. 629 * 630 * The second item in the returned tuple is the encoding that was actually 631 * used (which may differ from this encoding thanks to some encodings 632 * having UTF-8 as their output encoding). 633 * 634 * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon 635 * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that 636 * were replaced with numeric character references) and `NS_OK` otherwise. 637 638 * _Note:_ It is wrong to use this when the input buffer represents only 639 * a segment of the input instead of the whole input. Use `NewEncoder()` 640 * when encoding segmented output. 
641 */ 642 inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode( 643 Span<const char16_t> aString, nsACString& aOut) const { 644 const Encoding* encoding = this; 645 nsresult rv = mozilla_encoding_encode_from_utf16( 646 &encoding, aString.Elements(), aString.Length(), &aOut); 647 return {rv, WrapNotNull(encoding)}; 648 } 649 650 /** 651 * Instantiates a new decoder for this encoding with BOM sniffing enabled. 652 * 653 * BOM sniffing may cause the returned decoder to morph into a decoder 654 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. 655 */ 656 inline UniquePtr<Decoder> NewDecoder() const { 657 UniquePtr<Decoder> decoder(encoding_new_decoder(this)); 658 return decoder; 659 } 660 661 /** 662 * Instantiates a new decoder for this encoding with BOM sniffing enabled 663 * into memory occupied by a previously-instantiated decoder. 664 * 665 * BOM sniffing may cause the returned decoder to morph into a decoder 666 * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. 667 */ 668 inline void NewDecoderInto(Decoder& aDecoder) const { 669 encoding_new_decoder_into(this, &aDecoder); 670 } 671 672 /** 673 * Instantiates a new decoder for this encoding with BOM removal. 674 * 675 * If the input starts with bytes that are the BOM for this encoding, 676 * those bytes are removed. However, the decoder never morphs into a 677 * decoder for another encoding: A BOM for another encoding is treated as 678 * (potentially malformed) input to the decoding algorithm for this 679 * encoding. 680 */ 681 inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const { 682 UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this)); 683 return decoder; 684 } 685 686 /** 687 * Instantiates a new decoder for this encoding with BOM removal 688 * into memory occupied by a previously-instantiated decoder. 689 * 690 * If the input starts with bytes that are the BOM for this encoding, 691 * those bytes are removed. 
However, the decoder never morphs into a 692 * decoder for another encoding: A BOM for another encoding is treated as 693 * (potentially malformed) input to the decoding algorithm for this 694 * encoding. 695 */ 696 inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const { 697 encoding_new_decoder_with_bom_removal_into(this, &aDecoder); 698 } 699 700 /** 701 * Instantiates a new decoder for this encoding with BOM handling disabled. 702 * 703 * If the input starts with bytes that look like a BOM, those bytes are 704 * not treated as a BOM. (Hence, the decoder never morphs into a decoder 705 * for another encoding.) 706 * 707 * _Note:_ If the caller has performed BOM sniffing on its own but has not 708 * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()` 709 * instead of this method to cause the BOM to be removed. 710 */ 711 inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const { 712 UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this)); 713 return decoder; 714 } 715 716 /** 717 * Instantiates a new decoder for this encoding with BOM handling disabled 718 * into memory occupied by a previously-instantiated decoder. 719 * 720 * If the input starts with bytes that look like a BOM, those bytes are 721 * not treated as a BOM. (Hence, the decoder never morphs into a decoder 722 * for another encoding.) 723 * 724 * _Note:_ If the caller has performed BOM sniffing on its own but has not 725 * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()` 726 * instead of this method to cause the BOM to be removed. 727 */ 728 inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const { 729 encoding_new_decoder_without_bom_handling_into(this, &aDecoder); 730 } 731 732 /** 733 * Instantiates a new encoder for the output encoding of this encoding. 
734 */ 735 inline UniquePtr<Encoder> NewEncoder() const { 736 UniquePtr<Encoder> encoder(encoding_new_encoder(this)); 737 return encoder; 738 } 739 740 /** 741 * Instantiates a new encoder for the output encoding of this encoding 742 * into memory occupied by a previously-instantiated encoder. 743 */ 744 inline void NewEncoderInto(Encoder& aEncoder) const { 745 encoding_new_encoder_into(this, &aEncoder); 746 } 747 748 /** 749 * Validates UTF-8. 750 * 751 * Returns the index of the first byte that makes the input malformed as 752 * UTF-8 or the length of the input if the input is entirely valid. 753 */ 754 static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) { 755 return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length()); 756 } 757 758 /** 759 * Validates ASCII. 760 * 761 * Returns the index of the first byte that makes the input malformed as 762 * ASCII or the length of the input if the input is entirely valid. 763 */ 764 static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) { 765 return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length()); 766 } 767 768 /** 769 * Validates ISO-2022-JP ASCII-state data. 770 * 771 * Returns the index of the first byte that makes the input not 772 * representable in the ASCII state of ISO-2022-JP or the length of the 773 * input if the input is entirely representable in the ASCII state of 774 * ISO-2022-JP. 775 */ 776 static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) { 777 return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(), 778 aBuffer.Length()); 779 } 780 781 private: 782 Encoding() = delete; 783 Encoding(const Encoding&) = delete; 784 Encoding& operator=(const Encoding&) = delete; 785 ~Encoding() = delete; 786 }; 787 788 /** 789 * A converter that decodes a byte stream into Unicode according to a 790 * character encoding in a streaming (incremental) manner. 
 *
 * The various `Decode*` methods take an input buffer (`aSrc`) and an output
 * buffer (`aDst`), both of which are caller-allocated. There are variants for
 * both UTF-8 and UTF-16 output buffers.
 *
 * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
 * into `aDst` until one of the following three things happens:
 *
 * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
 *    variants only).
 *
 * 2. The output buffer has been filled so near capacity that the decoder
 *    cannot be sure that processing an additional byte of input wouldn't
 *    cause so much output that the output buffer would overflow.
 *
 * 3. All the input bytes have been processed.
 *
 * The `Decode*` method then returns a tuple of a status indicating which one
 * of the three reasons to return happened, how many input bytes were read,
 * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
 * when decoding to UTF-16) were written, and in the case of the
 * variants performing replacement, a boolean indicating whether an error was
 * replaced with the REPLACEMENT CHARACTER during the call.
 *
 * The number of bytes "written" is what's logically written. Garbage may be
 * written in the output buffer beyond the point logically written to.
 *
 * In the case of the `*WithoutReplacement` variants, the status is a
 * `uint32_t` whose possible values are packed info about a malformed byte
 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
 * listed above).
 *
 * Packed info about malformed sequences has the following format:
 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
 * indicate the number of bytes that were consumed after the malformed
 * sequence, and the next-lowest 8 bits, when shifted right by 8, indicate
 * the length of the malformed byte sequence (possible decimal values 1, 2,
 * 3 or 4). The maximum possible sum of the two is 6.
 *
 * In the case of methods whose name does not end with
 * `*WithoutReplacement`, malformed sequences are automatically replaced
 * with the REPLACEMENT CHARACTER and errors do not cause the methods to
 * return early.
 *
 * When decoding to UTF-8, the output buffer must have at least 4 bytes of
 * space. When decoding to UTF-16, the output buffer must have at least two
 * UTF-16 code units (`char16_t`) of space.
 *
 * When decoding to UTF-8 without replacement, the methods are guaranteed
 * not to return indicating that more output space is needed if the length
 * of the output buffer is at least the length returned by
 * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
 * with replacement, the length of the output buffer that guarantees the
 * methods not to return indicating that more output space is needed is given
 * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
 * or without replacement, the length of the output buffer that guarantees
 * the methods not to return indicating that more output space is needed is
 * given by `MaxUTF16BufferLength()`.
 *
 * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
 * and the output after each `Decode*` call is guaranteed to consist of
 * complete characters. (I.e. the code unit sequence for the last character is
 * guaranteed not to be split across output buffers.)
854 * 855 * The boolean argument `aLast` indicates that the end of the stream is reached 856 * when all the bytes in `aSrc` have been consumed. 857 * 858 * A `Decoder` object can be used to incrementally decode a byte stream. 859 * 860 * During the processing of a single stream, the caller must call `Decode*` 861 * zero or more times with `aLast` set to `false` and then call `Decode*` at 862 * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`, 863 * the processing of the stream has ended. Otherwise, the caller must call 864 * `Decode*` again with `aLast` set to `true` (or treat a malformed result, 865 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error). 866 * 867 * Once the stream has ended, the `Decoder` object must not be used anymore. 868 * That is, you need to create another one to process another stream. 869 * 870 * When the decoder returns `kOutputFull` or the decoder returns a malformed 871 * result and the caller does not wish to treat it as a fatal error, the input 872 * buffer `aSrc` may not have been completely consumed. In that case, the caller 873 * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next 874 * call. 875 * 876 * # Infinite loops 877 * 878 * When converting with a fixed-size output buffer whose size is too small to 879 * accommodate one character of output, an infinite loop ensues. When 880 * converting with a fixed-size output buffer, it generally makes sense to 881 * make the buffer fairly large (e.g. couple of kilobytes). 882 */ 883 class Decoder final { 884 public: 885 ~Decoder() = default; 886 static void operator delete(void* aDecoder) { 887 decoder_free(reinterpret_cast<Decoder*>(aDecoder)); 888 } 889 890 /** 891 * The `Encoding` this `Decoder` is for. 892 * 893 * BOM sniffing can change the return value of this method during the life 894 * of the decoder. 
895 */ 896 inline NotNull<const mozilla::Encoding*> Encoding() const { 897 return WrapNotNull(decoder_encoding(this)); 898 } 899 900 /** 901 * Query the worst-case UTF-8 output size _with replacement_. 902 * 903 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`) 904 * that will not overflow given the current state of the decoder and 905 * `aByteLength` number of additional input bytes when decoding with 906 * errors handled by outputting a REPLACEMENT CHARACTER for each malformed 907 * sequence. 908 */ 909 inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const { 910 CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength)); 911 if (max.value() == std::numeric_limits<size_t>::max()) { 912 // Mark invalid by overflowing 913 max++; 914 MOZ_ASSERT(!max.isValid()); 915 } 916 return max; 917 } 918 919 /** 920 * Query the worst-case UTF-8 output size _without replacement_. 921 * 922 * Returns the size of the output buffer in UTF-8 code units (`uint8_t`) 923 * that will not overflow given the current state of the decoder and 924 * `aByteLength` number of additional input bytes when decoding without 925 * replacement error handling. 926 * 927 * Note that this value may be too small for the `WithReplacement` case. 928 * Use `MaxUTF8BufferLength()` for that case. 929 */ 930 inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement( 931 size_t aByteLength) const { 932 CheckedInt<size_t> max( 933 decoder_max_utf8_buffer_length_without_replacement(this, aByteLength)); 934 if (max.value() == std::numeric_limits<size_t>::max()) { 935 // Mark invalid by overflowing 936 max++; 937 MOZ_ASSERT(!max.isValid()); 938 } 939 return max; 940 } 941 942 /** 943 * Incrementally decode a byte stream into UTF-8 with malformed sequences 944 * replaced with the REPLACEMENT CHARACTER. 945 * 946 * See the documentation of the class for documentation for `Decode*` 947 * methods collectively. 
948 */ 949 inline std::tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8( 950 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) { 951 size_t srcRead = aSrc.Length(); 952 size_t dstWritten = aDst.Length(); 953 bool hadReplacements; 954 uint32_t result = 955 decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(), 956 &dstWritten, aLast, &hadReplacements); 957 return {result, srcRead, dstWritten, hadReplacements}; 958 } 959 960 /** 961 * Incrementally decode a byte stream into UTF-8 _without replacement_. 962 * 963 * See the documentation of the class for documentation for `Decode*` 964 * methods collectively. 965 */ 966 inline std::tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement( 967 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) { 968 size_t srcRead = aSrc.Length(); 969 size_t dstWritten = aDst.Length(); 970 uint32_t result = decoder_decode_to_utf8_without_replacement( 971 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); 972 return {result, srcRead, dstWritten}; 973 } 974 975 /** 976 * Query the worst-case UTF-16 output size (with or without replacement). 977 * 978 * Returns the size of the output buffer in UTF-16 code units (`char16_t`) 979 * that will not overflow given the current state of the decoder and 980 * `aByteLength` number of additional input bytes. 981 * 982 * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the 983 * return value of this method applies also in the 984 * `_without_replacement` case. 985 */ 986 inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const { 987 CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length)); 988 if (max.value() == std::numeric_limits<size_t>::max()) { 989 // Mark invalid by overflowing 990 max++; 991 MOZ_ASSERT(!max.isValid()); 992 } 993 return max; 994 } 995 996 /** 997 * Incrementally decode a byte stream into UTF-16 with malformed sequences 998 * replaced with the REPLACEMENT CHARACTER. 
999 * 1000 * See the documentation of the class for documentation for `Decode*` 1001 * methods collectively. 1002 */ 1003 inline std::tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16( 1004 Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) { 1005 size_t srcRead = aSrc.Length(); 1006 size_t dstWritten = aDst.Length(); 1007 bool hadReplacements; 1008 uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead, 1009 aDst.Elements(), &dstWritten, 1010 aLast, &hadReplacements); 1011 return {result, srcRead, dstWritten, hadReplacements}; 1012 } 1013 1014 /** 1015 * Incrementally decode a byte stream into UTF-16 _without replacement_. 1016 * 1017 * See the documentation of the class for documentation for `Decode*` 1018 * methods collectively. 1019 */ 1020 inline std::tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement( 1021 Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) { 1022 size_t srcRead = aSrc.Length(); 1023 size_t dstWritten = aDst.Length(); 1024 uint32_t result = decoder_decode_to_utf16_without_replacement( 1025 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); 1026 return {result, srcRead, dstWritten}; 1027 } 1028 1029 /** 1030 * Checks for compatibility with storing Unicode scalar values as unsigned 1031 * bytes taking into account the state of the decoder. 1032 * 1033 * Returns `mozilla::Nothing()` if the decoder is not in a neutral state, 1034 * including waiting for the BOM, or if the encoding is never 1035 * Latin1-byte-compatible. 1036 * 1037 * Otherwise returns the index of the first byte whose unsigned value doesn't 1038 * directly correspond to the decoded Unicode scalar value, or the length 1039 * of the input if all bytes in the input decode directly to scalar values 1040 * corresponding to the unsigned byte values. 1041 * 1042 * Does not change the state of the decoder. 1043 * 1044 * Do not use this unless you are supporting SpiderMonkey-style string 1045 * storage optimizations. 
1046 */ 1047 inline mozilla::Maybe<size_t> Latin1ByteCompatibleUpTo( 1048 Span<const uint8_t> aBuffer) const { 1049 size_t upTo = decoder_latin1_byte_compatible_up_to(this, aBuffer.Elements(), 1050 aBuffer.Length()); 1051 if (upTo == std::numeric_limits<size_t>::max()) { 1052 return mozilla::Nothing(); 1053 } 1054 return mozilla::Some(upTo); 1055 } 1056 1057 private: 1058 Decoder() = delete; 1059 Decoder(const Decoder&) = delete; 1060 Decoder& operator=(const Decoder&) = delete; 1061 }; 1062 1063 /** 1064 * A converter that encodes a Unicode stream into bytes according to a 1065 * character encoding in a streaming (incremental) manner. 1066 * 1067 * The various `Encode*` methods take an input buffer (`aSrc`) and an output 1068 * buffer `aDst` both of which are caller-allocated. There are variants for 1069 * both UTF-8 and UTF-16 input buffers. 1070 * 1071 * An `Encode*` method encode characters from `aSrc` into bytes characters 1072 * stored into `aDst` until one of the following three things happens: 1073 * 1074 * 1. An unmappable character is encountered (`*WithoutReplacement` variants 1075 * only). 1076 * 1077 * 2. The output buffer has been filled so near capacity that the decoder 1078 * cannot be sure that processing an additional character of input wouldn't 1079 * cause so much output that the output buffer would overflow. 1080 * 1081 * 3. All the input characters have been processed. 1082 * 1083 * The `Encode*` method then returns tuple of a status indicating which one 1084 * of the three reasons to return happened, how many input code units (`uint8_t` 1085 * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read, 1086 * how many output bytes were written, and in the case of the variants that 1087 * perform replacement, a boolean indicating whether an unmappable 1088 * character was replaced with a numeric character reference during the call. 1089 * 1090 * The number of bytes "written" is what's logically written. 
 * Garbage may be written in the output buffer beyond the point logically
 * written to.
 *
 * In the case of the methods whose name ends with
 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
 * to the three cases listed above).
 *
 * In the case of methods whose name does not end with
 * `*WithoutReplacement`, unmappable characters are automatically replaced
 * with the corresponding numeric character references and unmappable
 * characters do not cause the methods to return early.
 *
 * When encoding from UTF-8 without replacement, the methods are guaranteed
 * not to return indicating that more output space is needed if the length
 * of the output buffer is at least the length returned by
 * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
 * UTF-8 with replacement, the length of the output buffer that guarantees the
 * methods not to return indicating that more output space is needed in the
 * absence of unmappable characters is given by
 * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
 * UTF-16 without replacement, the methods are guaranteed not to return
 * indicating that more output space is needed if the length of the output
 * buffer is at least the length returned by
 * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
 * from UTF-16 with replacement, the length of the output buffer that
 * guarantees the methods not to return indicating that more output space is
 * needed in the absence of unmappable characters is given by
 * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
 * When encoding with replacement, applications are not expected to size the
 * buffer for the worst case ahead of time but to resize the buffer if there
 * are unmappable characters. This is why max length queries are only available
 * for the case where there are no unmappable characters.
 *
 * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
 * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
 * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
 * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
 * surrogate pairs are not split across input buffer boundaries.
 *
 * After an `Encode*` call returns, the output produced so far, taken as a
 * whole from the start of the stream, is guaranteed to consist of a valid
 * byte sequence in the target encoding. (I.e. the code unit sequence for a
 * character is guaranteed not to be split across output buffers. However, due
 * to the stateful nature of ISO-2022-JP, the stream needs to be considered
 * from the start for it to be valid. For other encodings, the validity holds
 * on a per-output buffer basis.)
 *
 * The boolean argument `aLast` indicates that the end of the stream is reached
 * when all the characters in `aSrc` have been consumed. This argument is needed
 * for ISO-2022-JP and is ignored for other encodings.
 *
 * An `Encoder` object can be used to incrementally encode a Unicode stream.
 *
 * During the processing of a single stream, the caller must call `Encode*`
 * zero or more times with `aLast` set to `false` and then call `Encode*` at
 * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
 * the processing of the stream has ended. Otherwise, the caller must call
 * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
 *
 * Once the stream has ended, the `Encoder` object must not be used anymore.
1152 * That is, you need to create another one to process another stream. 1153 * 1154 * When the encoder returns `kOutputFull` or the encoder returns an unmappable 1155 * result and the caller does not wish to treat it as a fatal error, the input 1156 * buffer `aSrc` may not have been completely consumed. In that case, the caller 1157 * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next 1158 * call. 1159 * 1160 * # Infinite loops 1161 * 1162 * When converting with a fixed-size output buffer whose size is too small to 1163 * accommodate one character of output, an infinite loop ensues. When 1164 * converting with a fixed-size output buffer, it generally makes sense to 1165 * make the buffer fairly large (e.g. couple of kilobytes). 1166 */ 1167 class Encoder final { 1168 public: 1169 ~Encoder() = default; 1170 1171 static void operator delete(void* aEncoder) { 1172 encoder_free(reinterpret_cast<Encoder*>(aEncoder)); 1173 } 1174 1175 /** 1176 * The `Encoding` this `Encoder` is for. 1177 */ 1178 inline NotNull<const mozilla::Encoding*> Encoding() const { 1179 return WrapNotNull(encoder_encoding(this)); 1180 } 1181 1182 /** 1183 * Returns `true` if this is an ISO-2022-JP encoder that's not in the 1184 * ASCII state and `false` otherwise. 1185 */ 1186 inline bool HasPendingState() const { 1187 return encoder_has_pending_state(this); 1188 } 1189 1190 /** 1191 * Query the worst-case output size when encoding from UTF-8 with 1192 * replacement. 1193 * 1194 * Returns the size of the output buffer in bytes that will not overflow 1195 * given the current state of the encoder and `aByteLength` number of 1196 * additional input code units if there are no unmappable characters in 1197 * the input. 
1198 */ 1199 inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables( 1200 size_t aByteLength) const { 1201 CheckedInt<size_t> max( 1202 encoder_max_buffer_length_from_utf8_if_no_unmappables(this, 1203 aByteLength)); 1204 if (max.value() == std::numeric_limits<size_t>::max()) { 1205 // Mark invalid by overflowing 1206 max++; 1207 MOZ_ASSERT(!max.isValid()); 1208 } 1209 return max; 1210 } 1211 1212 /** 1213 * Query the worst-case output size when encoding from UTF-8 without 1214 * replacement. 1215 * 1216 * Returns the size of the output buffer in bytes that will not overflow 1217 * given the current state of the encoder and `aByteLength` number of 1218 * additional input code units. 1219 */ 1220 inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement( 1221 size_t aByteLength) const { 1222 CheckedInt<size_t> max( 1223 encoder_max_buffer_length_from_utf8_without_replacement(this, 1224 aByteLength)); 1225 if (max.value() == std::numeric_limits<size_t>::max()) { 1226 // Mark invalid by overflowing 1227 max++; 1228 MOZ_ASSERT(!max.isValid()); 1229 } 1230 return max; 1231 } 1232 1233 /** 1234 * Incrementally encode into byte stream from UTF-8 with unmappable 1235 * characters replaced with HTML (decimal) numeric character references. 1236 * 1237 * See the documentation of the class for documentation for `Encode*` 1238 * methods collectively. 1239 * 1240 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING: 1241 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless 1242 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check. 
1243 */ 1244 inline std::tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8( 1245 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) { 1246 size_t srcRead = aSrc.Length(); 1247 size_t dstWritten = aDst.Length(); 1248 bool hadReplacements; 1249 uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead, 1250 aDst.Elements(), &dstWritten, 1251 aLast, &hadReplacements); 1252 return {result, srcRead, dstWritten, hadReplacements}; 1253 } 1254 1255 /** 1256 * Incrementally encode into byte stream from UTF-8 _without replacement_. 1257 * 1258 * See the documentation of the class for documentation for `Encode*` 1259 * methods collectively. 1260 * 1261 * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING: 1262 * The input ***MUST*** be valid UTF-8 or bad things happen! Unless 1263 * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check. 1264 */ 1265 inline std::tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement( 1266 Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) { 1267 size_t srcRead = aSrc.Length(); 1268 size_t dstWritten = aDst.Length(); 1269 uint32_t result = encoder_encode_from_utf8_without_replacement( 1270 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); 1271 return {result, srcRead, dstWritten}; 1272 } 1273 1274 /** 1275 * Query the worst-case output size when encoding from UTF-16 with 1276 * replacement. 1277 * 1278 * Returns the size of the output buffer in bytes that will not overflow 1279 * given the current state of the encoder and `aU16Length` number of 1280 * additional input code units if there are no unmappable characters in 1281 * the input. 
1282 */ 1283 inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables( 1284 size_t aU16Length) const { 1285 CheckedInt<size_t> max( 1286 encoder_max_buffer_length_from_utf16_if_no_unmappables(this, 1287 aU16Length)); 1288 if (max.value() == std::numeric_limits<size_t>::max()) { 1289 // Mark invalid by overflowing 1290 max++; 1291 MOZ_ASSERT(!max.isValid()); 1292 } 1293 return max; 1294 } 1295 1296 /** 1297 * Query the worst-case output size when encoding from UTF-16 without 1298 * replacement. 1299 * 1300 * Returns the size of the output buffer in bytes that will not overflow 1301 * given the current state of the encoder and `aU16Length` number of 1302 * additional input code units. 1303 */ 1304 inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement( 1305 size_t aU16Length) const { 1306 CheckedInt<size_t> max( 1307 encoder_max_buffer_length_from_utf16_without_replacement(this, 1308 aU16Length)); 1309 if (max.value() == std::numeric_limits<size_t>::max()) { 1310 // Mark invalid by overflowing 1311 max++; 1312 MOZ_ASSERT(!max.isValid()); 1313 } 1314 return max; 1315 } 1316 1317 /** 1318 * Incrementally encode into byte stream from UTF-16 with unmappable 1319 * characters replaced with HTML (decimal) numeric character references. 1320 * 1321 * See the documentation of the class for documentation for `Encode*` 1322 * methods collectively. 1323 */ 1324 inline std::tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16( 1325 Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) { 1326 size_t srcRead = aSrc.Length(); 1327 size_t dstWritten = aDst.Length(); 1328 bool hadReplacements; 1329 uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead, 1330 aDst.Elements(), &dstWritten, 1331 aLast, &hadReplacements); 1332 return {result, srcRead, dstWritten, hadReplacements}; 1333 } 1334 1335 /** 1336 * Incrementally encode into byte stream from UTF-16 _without replacement_. 
1337 * 1338 * See the documentation of the class for documentation for `Encode*` 1339 * methods collectively. 1340 */ 1341 inline std::tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement( 1342 Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) { 1343 size_t srcRead = aSrc.Length(); 1344 size_t dstWritten = aDst.Length(); 1345 uint32_t result = encoder_encode_from_utf16_without_replacement( 1346 this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast); 1347 return {result, srcRead, dstWritten}; 1348 } 1349 1350 private: 1351 Encoder() = delete; 1352 Encoder(const Encoder&) = delete; 1353 Encoder& operator=(const Encoder&) = delete; 1354 }; 1355 1356 }; // namespace mozilla 1357 1358 #endif // mozilla_Encoding_h