[ tor-browser ].git.dasho

Encoding.h (57371B)
      1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
      2 // file at the top-level directory of this distribution.
      3 //
      4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
      5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
      6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
      7 // option. This file may not be copied, modified, or distributed
      8 // except according to those terms.
      9 
     10 // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
     11 // "top-level directory" in the above notice refers to
     12 // third_party/rust/encoding_c/.
     13 
     14 #ifndef mozilla_Encoding_h
     15 #define mozilla_Encoding_h
     16 
     17 #include "mozilla/CheckedInt.h"
     18 #include "mozilla/Maybe.h"
     19 #include "mozilla/NotNull.h"
     20 #include "mozilla/Span.h"
     21 #include "nsString.h"
     22 
     23 #include <tuple>
     24 
     25 namespace mozilla {  // forward declarations named by the macros below
     26 class Encoding;
     27 class Decoder;
     28 class Encoder;
     29 }  // namespace mozilla
     30 
     31 #define ENCODING_RS_ENCODING mozilla::Encoding  // defined before including encoding_rs.h; presumably consumed there as type names - confirm
     32 #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
     33  mozilla::NotNull<const mozilla::Encoding*>
     34 #define ENCODING_RS_ENCODER mozilla::Encoder
     35 #define ENCODING_RS_DECODER mozilla::Decoder
     36 
     37 #include "encoding_rs.h"
     38 
     39 extern "C" {
     40 // C-linkage functions called by the inline methods of mozilla::Encoding.
     41 nsresult mozilla_encoding_decode_to_nsstring(const mozilla::Encoding** encoding,
     42                                             const uint8_t* src, size_t src_len,
     43                                             nsAString* dst);
     44 
     45 nsresult mozilla_encoding_decode_to_nsstring_with_bom_removal(
     46    const mozilla::Encoding* encoding, const uint8_t* src, size_t src_len,
     47    nsAString* dst);
     48 
     49 nsresult mozilla_encoding_decode_to_nsstring_without_bom_handling(
     50    const mozilla::Encoding* encoding, const uint8_t* src, size_t src_len,
     51    nsAString* dst);
     52 
     53 nsresult
     54 mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
     55    const mozilla::Encoding* encoding, const uint8_t* src, size_t src_len,
     56    nsAString* dst);
     57 
     58 nsresult mozilla_encoding_encode_from_utf16(const mozilla::Encoding** encoding,
     59                                            const char16_t* src, size_t src_len,
     60                                            nsACString* dst);
     61 
     62 nsresult mozilla_encoding_decode_to_nscstring(
     63    const mozilla::Encoding** encoding, const nsACString* src, nsACString* dst);
     64 
     65 nsresult mozilla_encoding_decode_to_nscstring_with_bom_removal(
     66    const mozilla::Encoding* encoding, const nsACString* src, nsACString* dst);
     67 
     68 nsresult mozilla_encoding_decode_to_nscstring_without_bom_handling(
     69    const mozilla::Encoding* encoding, const nsACString* src, nsACString* dst);
     70 
     71 nsresult mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
     72    const mozilla::Encoding* encoding, const uint8_t* src, size_t src_len,
     73    nsACString* dst, size_t already_validated);
     74 
     75 nsresult
     76 mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
     77    const mozilla::Encoding* encoding, const nsACString* src, nsACString* dst);
     78 
     79 nsresult mozilla_encoding_encode_from_nscstring(
     80    const mozilla::Encoding** encoding, const nsACString* src, nsACString* dst);
     81 
     82 }  // extern "C"
     83 
     84 namespace mozilla {
     85 
     86 /**
     87 * Return value from `Decoder`/`Encoder` to indicate that input
     88 * was exhausted.
     89 */
     90 const uint32_t kInputEmpty = INPUT_EMPTY;  // INPUT_EMPTY is defined outside this file (presumably encoding_rs.h - confirm)
     91 
     92 /**
     93 * Return value from `Decoder`/`Encoder` to indicate that output
     94 * space was insufficient.
     95 */
     96 const uint32_t kOutputFull = OUTPUT_FULL;  // OUTPUT_FULL is defined outside this file (presumably encoding_rs.h - confirm)
     97 
     98 /**
     99 * An encoding as defined in the Encoding Standard
    100 * (https://encoding.spec.whatwg.org/).
    101 *
    102 * See https://docs.rs/encoding_rs/ for the Rust API docs.
    103 *
    104 * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
    105 * sequence and, in most cases, vice versa. Each encoding has a name, an output
    106 * encoding, and one or more labels.
    107 *
    108 * _Labels_ are ASCII-case-insensitive strings that are used to identify an
    109 * encoding in formats and protocols. The _name_ of the encoding is the
    110 * preferred label in the case appropriate for returning from the
    111 * `characterSet` property of the `Document` DOM interface, except for
    112 * the replacement encoding whose name is not one of its labels.
    113 *
    114 * The _output encoding_ is the encoding used for form submission and URL
    115 * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
    116 * UTF-16LE and UTF-16BE encodings and the encoding itself for other
    117 * encodings.
    118 *
    119 * # Streaming vs. Non-Streaming
    120 *
    121 * When you have the entire input in a single buffer, you can use the
    122 * methods `Decode()`, `DecodeWithBOMRemoval()`,
    123 * `DecodeWithoutBOMHandling()`,
    124 * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
    125 * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
    126 * `NewEncoder()` methods), these methods perform heap allocations. You should
    127 * use the `Decoder` and `Encoder` objects when your input is split into multiple
    128 * buffers or when you want to control the allocation of the output buffers.
    129 *
    130 * # Instances
    131 *
    132 * All instances of `Encoding` are statically allocated and have the process's
    133 * lifetime. There is precisely one unique `Encoding` instance for each
    134 * encoding defined in the Encoding Standard.
    135 *
    136 * To obtain a reference to a particular encoding whose identity you know at
    137 * compile time, use a `static` that refers to encoding. There is a `static`
    138 * for each encoding. The `static`s are named in all caps with hyphens
    139 * replaced with underscores and with `_ENCODING` appended to the
    140 * name. For example, if you know at compile time that you will want to
    141 * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
    142 *
    143 * If you don't know what encoding you need at compile time and need to
    144 * dynamically get an encoding by label, use `Encoding::ForLabel()`.
    145 *
    146 * Pointers to `Encoding` can be compared with `==` to check for the sameness
    147 * of two encodings.
    148 *
    149 * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
    150 * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
    151 * `const mozilla::Encoding*` in the C signature and
    152 * `*const encoding_rs::Encoding` in the corresponding Rust signature.
    153 */
    154 class Encoding final {
    155 public:
    156  /**
    157   * Implements the _get an encoding_ algorithm
    158   * (https://encoding.spec.whatwg.org/#concept-encoding-get).
    159   *
    160   * If, after ASCII-lowercasing and removing leading and trailing
    161   * whitespace, the argument matches a label defined in the Encoding
    162   * Standard, `const Encoding*` representing the corresponding
    163   * encoding is returned. If there is no match, `nullptr` is returned.
    164   *
    165   * This is the right method to use if the action upon the method returning
    166   * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
    167   * instead. When the action upon the method returning `nullptr` is not to
    168   * proceed with a fallback but to refuse processing,
    169   * `ForLabelNoReplacement()` is more appropriate.
    170   */
    171  static inline const Encoding* ForLabel(Span<const char> aLabel) {
    172    auto* labelBytes = reinterpret_cast<const uint8_t*>(aLabel.Elements());
    173    return encoding_for_label(labelBytes, aLabel.Length());  // bytes + length
    174  }
    175 
    176  /**
    177   * `nsAString` argument version. See above for docs.
    178   */
    179  static inline const Encoding* ForLabel(const nsAString& aLabel) {
    180    return ForLabel(NS_ConvertUTF16toUTF8(aLabel));  // convert, then reuse the Span overload
    181  }
    182 
    183  /**
    184   * This method behaves the same as `ForLabel()`, except when `ForLabel()`
    185   * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
    186   *
    187   * This method is useful in scenarios where a fatal error is required
    188   * upon invalid label, because in those cases the caller typically wishes
    189   * to treat the labels that map to the replacement encoding as fatal
    190   * errors, too.
    191   *
    192   * It is not OK to use this method when the action upon the method returning
    193   * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
    194   * such a case, the `ForLabel()` method should be used instead in order to
    195   * avoid unsafe fallback for labels that `ForLabel()` maps to
    196   * `REPLACEMENT_ENCODING`.
    197   */
    198  static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel) {
    199    auto* labelBytes = reinterpret_cast<const uint8_t*>(aLabel.Elements());
    200    return encoding_for_label_no_replacement(labelBytes, aLabel.Length());
    201  }
    202 
    203  /**
    204   * `nsAString` argument version. See above for docs.
    205   */
    206  static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel) {
    207    return ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));  // convert, then reuse the Span overload
    208  }
    209 
    210  /**
    211   * Performs non-incremental BOM sniffing.
    212   *
    213   * The argument must either be a buffer representing the entire input
    214   * stream (non-streaming case) or a buffer representing at least the first
    215   * three bytes of the input stream (streaming case).
    216   *
    217   * Returns `{UTF_8_ENCODING, 3}`,
    218   * `{UTF_16LE_ENCODING, 2}` or
    219   * `{UTF_16BE_ENCODING, 2}` if the argument starts with the
    220   * UTF-8, UTF-16LE or UTF-16BE BOM or `{nullptr, 0}` otherwise.
    221   */
    222  static inline std::tuple<const Encoding*, size_t> ForBOM(
    223      Span<const uint8_t> aBuffer) {
    224    size_t bomLength = aBuffer.Length();  // passed by address; the callee may overwrite it
    225    const Encoding* sniffed = encoding_for_bom(aBuffer.Elements(), &bomLength);
    226    return std::make_tuple(sniffed, bomLength);
    227  }
    228 
    229  /**
    230   * Writes the name of this encoding into `aName`.
    231   *
    232   * This name is appropriate to return as-is from the DOM
    233   * `document.characterSet` property.
    234   */
    235  inline void Name(nsACString& aName) const {
    236    aName.SetLength(ENCODING_NAME_MAX_LENGTH);  // size the buffer before writing
    237    auto* nameBuffer = reinterpret_cast<uint8_t*>(aName.BeginWriting());
    238    const size_t written = encoding_name(this, nameBuffer);
    239    aName.SetLength(written);  // truncation in the 64-bit case is OK
    240  }
    241 
    242  /**
    243   * Checks whether the _output encoding_ of this encoding can encode every
    244   * Unicode code point. (Only true if the output encoding is UTF-8.)
    245   */
    246  inline bool CanEncodeEverything() const {
    247    return encoding_can_encode_everything(this);  // thin forwarder; callee is declared outside this file
    248  }
    249 
    250  /**
    251   * Checks whether this encoding maps one byte to one Basic Multilingual
    252   * Plane code point (i.e. byte length equals decoded UTF-16 length) and
    253   * vice versa (for mappable characters).
    254   *
    255   * `true` iff this encoding is on the list of Legacy single-byte
    256   * encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
    257   * in the spec or x-user-defined.
    258   */
    259  inline bool IsSingleByte() const { return encoding_is_single_byte(this); }  // thin forwarder; callee is declared outside this file
    260 
    261  /**
    262   * Checks whether the bytes 0x00...0x7F map exclusively to the characters
    263   * U+0000...U+007F and vice versa.
    264   */
    265  inline bool IsAsciiCompatible() const {
    266    return encoding_is_ascii_compatible(this);  // thin forwarder; callee is declared outside this file
    267  }
    268 
    269  /**
    270   * Checks whether this is a Japanese legacy encoding.
    271   */
    272  inline bool IsJapaneseLegacy() const {
    273    const bool sjisOrEucJp = this == SHIFT_JIS_ENCODING || this == EUC_JP_ENCODING;
    274    return sjisOrEucJp || this == ISO_2022_JP_ENCODING;  // pointer identity checks
    275  }
    276 
    277  /**
    278   * Returns the _output encoding_ of this encoding. This is UTF-8 for
    279   * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
    280   */
    281  inline NotNull<const mozilla::Encoding*> OutputEncoding() const {
    282    return WrapNotNull(encoding_output_encoding(this));  // wraps the C API result in NotNull
    283  }
    284 
    285  /**
    286   * Decode complete input to `nsACString` _with BOM sniffing_ and with
    287   * malformed sequences replaced with the REPLACEMENT CHARACTER when the
    288   * entire input is available as a single buffer (i.e. the end of the
    289   * buffer marks the end of the stream).
    290   *
    291   * This method implements the (non-streaming version of) the
    292   * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
    293   *
    294   * The second item in the returned tuple is the encoding that was actually
    295   * used (which may differ from this encoding thanks to BOM sniffing).
    296   *
    297   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
    298   * if there were malformed sequences (that were replaced with the
    299   * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
    300   * tuple.
    301   *
    302   * The backing buffer of the string isn't copied if the input buffer
    303   * is heap-allocated and decoding from UTF-8 and the input is valid
    304   * BOMless UTF-8, decoding from an ASCII-compatible encoding and
    305   * the input is valid ASCII or decoding from ISO-2022-JP and the
    306   * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
    307   * the same string as both arguments.
    308   *
    309   * _Note:_ It is wrong to use this when the input buffer represents only
    310   * a segment of the input instead of the whole input. Use `NewDecoder()`
    311   * when decoding segmented input.
    312   */
    313  inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
    314      const nsACString& aBytes, nsACString& aOut) const {
    315    const Encoding* used = this;  // in/out: the callee may replace it
    316    if (&aBytes != &aOut) {
    317      // Distinct strings: decode straight from the input.
    318      nsresult rv = mozilla_encoding_decode_to_nscstring(&used, &aBytes, &aOut);
    319      return {rv, WrapNotNull(used)};
    320    }
    321    // Input and output are the same object: hand the callee a copy of the
    322    // input so that its source and destination are distinct strings.
    323    nsAutoCString inputCopy(aBytes);
    324    nsresult rv = mozilla_encoding_decode_to_nscstring(&used, &inputCopy, &aOut);
    325    return {rv, WrapNotNull(used)};
    326  }
    327 
    328  /**
    329   * Decode complete input to `nsAString` _with BOM sniffing_ and with
    330   * malformed sequences replaced with the REPLACEMENT CHARACTER when the
    331   * entire input is available as a single buffer (i.e. the end of the
    332   * buffer marks the end of the stream).
    333   *
    334   * This method implements the (non-streaming version of) the
    335   * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
    336   *
    337   * The second item in the returned tuple is the encoding that was actually
    338   * used (which may differ from this encoding thanks to BOM sniffing).
    339   *
    340   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
    341   * if there were malformed sequences (that were replaced with the
    342   * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
    343   * tuple.
    344   *
    345   * _Note:_ It is wrong to use this when the input buffer represents only
    346   * a segment of the input instead of the whole input. Use `NewDecoder()`
    347   * when decoding segmented input.
    348   */
    349  inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
    350      Span<const uint8_t> aBytes, nsAString& aOut) const {
    351    const Encoding* used = this;  // in/out: the callee may replace it
    352    const nsresult status = mozilla_encoding_decode_to_nsstring(
    353        &used, aBytes.Elements(), aBytes.Length(), &aOut);
    354    return {status, WrapNotNull(used)};
    355  }
    356 
    357  /**
    358   * Decode complete input to `nsACString` _with BOM removal_ and with
    359   * malformed sequences replaced with the REPLACEMENT CHARACTER when the
    360   * entire input is available as a single buffer (i.e. the end of the
    361   * buffer marks the end of the stream).
    362   *
    363   * When invoked on `UTF_8`, this method implements the (non-streaming
    364   * version of) the _UTF-8 decode_
    365   * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
    366   *
    367   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
    368   * if there were malformed sequences (that were replaced with the
    369   * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
    370   *
    371   * The backing buffer of the string isn't copied if the input buffer
    372   * is heap-allocated and decoding from UTF-8 and the input is valid
    373   * BOMless UTF-8, decoding from an ASCII-compatible encoding and
    374   * the input is valid ASCII or decoding from ISO-2022-JP and the
    375   * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
    376   * the same string as both arguments.
    377   *
    378   * _Note:_ It is wrong to use this when the input buffer represents only
    379   * a segment of the input instead of the whole input. Use
    380   * `NewDecoderWithBOMRemoval()` when decoding segmented input.
    381   */
    382  inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
    383                                       nsACString& aOut) const {
    384    if (&aBytes != &aOut) {
    385      // Distinct strings: decode straight from the input.
    386      return mozilla_encoding_decode_to_nscstring_with_bom_removal(
    387          this, &aBytes, &aOut);
    388    }
    389    // Input and output are the same object: decode from a copy of the input.
    390    nsAutoCString inputCopy(aBytes);
    391    return mozilla_encoding_decode_to_nscstring_with_bom_removal(
    392        this, &inputCopy, &aOut);
    393  }
    394 
    395  /**
    396   * Decode complete input to `nsAString` _with BOM removal_ and with
    397   * malformed sequences replaced with the REPLACEMENT CHARACTER when the
    398   * entire input is available as a single buffer (i.e. the end of the
    399   * buffer marks the end of the stream).
    400   *
    401   * When invoked on `UTF_8`, this method implements the (non-streaming
    402   * version of) the _UTF-8 decode_
    403   * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
    404   *
    405   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
    406   * if there were malformed sequences (that were replaced with the
    407   * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
    408   *
    409   * _Note:_ It is wrong to use this when the input buffer represents only
    410   * a segment of the input instead of the whole input. Use
    411   * `NewDecoderWithBOMRemoval()` when decoding segmented input.
    412   */
    413  inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
    414                                       nsAString& aOut) const {
    415    return mozilla_encoding_decode_to_nsstring_with_bom_removal(
    416        this, aBytes.Elements(), aBytes.Length(), &aOut);  // span passed as pointer + length
    417  }
    418 
    419  /**
    420   * Decode complete input to `nsACString` _without BOM handling_ and
    421   * with malformed sequences replaced with the REPLACEMENT CHARACTER when
    422   * the entire input is available as a single buffer (i.e. the end of the
    423   * buffer marks the end of the stream).
    424   *
    425   * When invoked on `UTF_8`, this method implements the (non-streaming
    426   * version of) the _UTF-8 decode without BOM_
    427   * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
    428   *
    429   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
    430   * if there were malformed sequences (that were replaced with the
    431   * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
    432   *
    433   * The backing buffer of the string isn't copied if the input buffer
    434   * is heap-allocated and decoding from UTF-8 and the input is valid
    435   * UTF-8, decoding from an ASCII-compatible encoding and the input
    436   * is valid ASCII or decoding from ISO-2022-JP and the input stays
    437   * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
    438   * as both arguments.
    439   *
    440   * _Note:_ It is wrong to use this when the input buffer represents only
    441   * a segment of the input instead of the whole input. Use
    442   * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
    443   */
    444  inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
    445                                           nsACString& aOut) const {
    446    if (&aBytes != &aOut) {
    447      // Distinct strings: decode straight from the input.
    448      return mozilla_encoding_decode_to_nscstring_without_bom_handling(
    449          this, &aBytes, &aOut);
    450    }
    451    // Input and output are the same object: decode from a copy of the input.
    452    nsAutoCString inputCopy(aBytes);
    453    return mozilla_encoding_decode_to_nscstring_without_bom_handling(
    454        this, &inputCopy, &aOut);
    455  }
    456 
    457  /**
    458   * Decode complete input to `nsAString` _without BOM handling_ and
    459   * with malformed sequences replaced with the REPLACEMENT CHARACTER when
    460   * the entire input is available as a single buffer (i.e. the end of the
    461   * buffer marks the end of the stream).
    462   *
    463   * When invoked on `UTF_8`, this method implements the (non-streaming
    464   * version of) the _UTF-8 decode without BOM_
    465   * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
    466   *
    467   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
    468   * if there were malformed sequences (that were replaced with the
    469   * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
    470   *
    471   * _Note:_ It is wrong to use this when the input buffer represents only
    472   * a segment of the input instead of the whole input. Use
    473   * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
    474   */
    475  inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
    476                                           nsAString& aOut) const {
    477    return mozilla_encoding_decode_to_nsstring_without_bom_handling(
    478        this, aBytes.Elements(), aBytes.Length(), &aOut);  // span passed as pointer + length
    479  }
    480 
    481  /**
    482   * Decode complete input to `nsACString` _without BOM handling_ and
    483   * _with malformed sequences treated as fatal_ when the entire input is
    484   * available as a single buffer (i.e. the end of the buffer marks the end
    485   * of the stream).
    486   *
    487   * When invoked on `UTF_8`, this method implements the (non-streaming
    488   * version of) the _UTF-8 decode without BOM or fail_
    489   * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
    490   * spec concept.
    491   *
    492   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
    493   * if a malformed sequence was encountered and `NS_OK` otherwise.
    494   *
    495   * The backing buffer of the string isn't copied if the input buffer
    496   * is heap-allocated and decoding from UTF-8 and the input is valid
    497   * UTF-8, decoding from an ASCII-compatible encoding and the input
    498   * is valid ASCII or decoding from ISO-2022-JP and the input stays
    499   * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
    500   * as both arguments.
    501   *
    502   * _Note:_ It is wrong to use this when the input buffer represents only
    503   * a segment of the input instead of the whole input. Use
    504   * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
    505   */
    506  inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
    507      const nsACString& aBytes, nsACString& aOut) const {
    508    if (&aBytes != &aOut) {
    509      // Distinct strings: decode straight from the input.
    510      return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
    511          this, &aBytes, &aOut);
    512    }
    513    // Input and output are the same object: decode from a copy of the input.
    514    nsAutoCString inputCopy(aBytes);
    515    return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
    516        this, &inputCopy, &aOut);
    517  }
    518 
    519  /**
    520   * Decode complete input to `nsACString` _without BOM handling_ and
    521   * with malformed sequences replaced with the REPLACEMENT CHARACTER when
    522   * the entire input is available as a single buffer (i.e. the end of the
    523   * buffer marks the end of the stream) _asserting that a number of bytes
    524   * from the start are already known to be valid UTF-8_.
    525   *
    526   * The use case for this method is avoiding copying when dealing with
    527   * input that has a UTF-8 BOM. _When in doubt, do not use this method._
    528   *
    529   * When invoked on `UTF_8`, this method implements the (non-streaming
    530   * version of) the _UTF-8 decode without BOM_
    531   * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
    532   *
    533   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
    534   * if there were malformed sequences (that were replaced with the
    535   * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
    536   *
    537   * _Note:_ It is wrong to use this when the input buffer represents only
    538   * a segment of the input instead of the whole input. Use
    539   * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
    540   *
    541   * # Safety
    542   *
    543   * The first `aAlreadyValidated` bytes of `aBytes` _must_ be valid UTF-8.
    544   * `aBytes` _must not_ alias the buffer (if any) of `aOut`.
    545   */
    546  inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
    547                                           nsACString& aOut,
    548                                           size_t aAlreadyValidated) const {
    549    return mozilla_encoding_decode_from_slice_to_nscstring_without_bom_handling(
    550        this, aBytes.Elements(), aBytes.Length(), &aOut, aAlreadyValidated);  // forwarded as-is: no aliasing or validation check is made here
    551  }
    552 
    553  /**
    554   * Decode complete input to `nsAString` _without BOM handling_ and
    555   * _with malformed sequences treated as fatal_ when the entire input is
    556   * available as a single buffer (i.e. the end of the buffer marks the end
    557   * of the stream).
    558   *
    559   * When invoked on `UTF_8`, this method implements the (non-streaming
    560   * version of) the _UTF-8 decode without BOM or fail_
    561   * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
    562   * spec concept.
    563   *
    564   * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
    565   * if a malformed sequence was encountered and `NS_OK` otherwise.
    566   *
    567   * _Note:_ It is wrong to use this when the input buffer represents only
    568   * a segment of the input instead of the whole input. Use
    569   * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
    570   */
    571  inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
    572      Span<const uint8_t> aBytes, nsAString& aOut) const {
    573    return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
    574        this, aBytes.Elements(), aBytes.Length(), &aOut);  // span passed as pointer + length
    575  }
    576 
    577  /**
    578   * Encode complete input to `nsACString` with unmappable characters
    579   * replaced with decimal numeric character references when the entire input
    580   * is available as a single buffer (i.e. the end of the buffer marks the
    581   * end of the stream).
    582   *
    583   * This method implements the (non-streaming version of) the
    584   * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
    585   *
    586   * The second item in the returned tuple is the encoding that was actually
    587   * used (which may differ from this encoding thanks to some encodings
    588   * having UTF-8 as their output encoding).
    589   *
    590   * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
    591   * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
    592   * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
    593   * replaced with numeric character references) and `NS_OK` otherwise.
    594   *
    595   * The backing buffer of the string isn't copied if the input buffer
    596   * is heap-allocated and encoding to UTF-8 and the input is valid
    597   * UTF-8, encoding to an ASCII-compatible encoding and the input
    598   * is valid ASCII or encoding from ISO-2022-JP and the input stays
    599   * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
    600   * as both arguments.
    601   *
    602   * _Note:_ It is wrong to use this when the input buffer represents only
    603   * a segment of the input instead of the whole input. Use `NewEncoder()`
    604   * when encoding segmented output.
    605   */
    606  inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
    607      const nsACString& aString, nsACString& aOut) const {
    608    const Encoding* used = this;  // in/out: the callee may replace it
    609    if (&aString != &aOut) {
    610      // Distinct strings: encode straight from the input.
    611      nsresult rv = mozilla_encoding_encode_from_nscstring(&used, &aString, &aOut);
    612      return {rv, WrapNotNull(used)};
    613    }
    614    // Input and output are the same object: hand the callee a copy of the
    615    // input so that its source and destination are distinct strings.
    616    nsAutoCString inputCopy(aString);
    617    nsresult rv = mozilla_encoding_encode_from_nscstring(&used, &inputCopy, &aOut);
    618    return {rv, WrapNotNull(used)};
    619  }
    620 
    621  /**
    622   * Encode complete input to `nsACString` with unmappable characters
    623   * replaced with decimal numeric character references when the entire input
    624   * is available as a single buffer (i.e. the end of the buffer marks the
    625   * end of the stream).
    626   *
    627   * This method implements the (non-streaming version of) the
    628   * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
    629   *
    630   * The second item in the returned tuple is the encoding that was actually
    631   * used (which may differ from this encoding thanks to some encodings
    632   * having UTF-8 as their output encoding).
    633   *
    634   * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
    635   * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
    636   * were replaced with numeric character references) and `NS_OK` otherwise.
    637   *
    638   * _Note:_ It is wrong to use this when the input buffer represents only
    639   * a segment of the input instead of the whole input. Use `NewEncoder()`
    640   * when encoding segmented output.
    641   */
    642  inline std::tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
    643      Span<const char16_t> aString, nsACString& aOut) const {
    644    const Encoding* encoding = this;
    645    nsresult rv = mozilla_encoding_encode_from_utf16(
    646        &encoding, aString.Elements(), aString.Length(), &aOut);
    647    return {rv, WrapNotNull(encoding)};
    648  }
    649 
    650  /**
    651   * Instantiates a new decoder for this encoding with BOM sniffing enabled.
    652   *
    653   * BOM sniffing may cause the returned decoder to morph into a decoder
    654   * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
    655   */
    656  inline UniquePtr<Decoder> NewDecoder() const {
    657    UniquePtr<Decoder> decoder(encoding_new_decoder(this));
    658    return decoder;
    659  }
    660 
    661  /**
    662   * Instantiates a new decoder for this encoding with BOM sniffing enabled
    663   * into memory occupied by a previously-instantiated decoder.
    664   *
    665   * BOM sniffing may cause the returned decoder to morph into a decoder
    666   * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
    667   */
    668  inline void NewDecoderInto(Decoder& aDecoder) const {
    669    encoding_new_decoder_into(this, &aDecoder);
    670  }
    671 
    672  /**
    673   * Instantiates a new decoder for this encoding with BOM removal.
    674   *
    675   * If the input starts with bytes that are the BOM for this encoding,
    676   * those bytes are removed. However, the decoder never morphs into a
    677   * decoder for another encoding: A BOM for another encoding is treated as
    678   * (potentially malformed) input to the decoding algorithm for this
    679   * encoding.
    680   */
    681  inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const {
    682    UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
    683    return decoder;
    684  }
    685 
    686  /**
    687   * Instantiates a new decoder for this encoding with BOM removal
    688   * into memory occupied by a previously-instantiated decoder.
    689   *
    690   * If the input starts with bytes that are the BOM for this encoding,
    691   * those bytes are removed. However, the decoder never morphs into a
    692   * decoder for another encoding: A BOM for another encoding is treated as
    693   * (potentially malformed) input to the decoding algorithm for this
    694   * encoding.
    695   */
    696  inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const {
    697    encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
    698  }
    699 
    700  /**
    701   * Instantiates a new decoder for this encoding with BOM handling disabled.
    702   *
    703   * If the input starts with bytes that look like a BOM, those bytes are
    704   * not treated as a BOM. (Hence, the decoder never morphs into a decoder
    705   * for another encoding.)
    706   *
    707   * _Note:_ If the caller has performed BOM sniffing on its own but has not
    708   * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
    709   * instead of this method to cause the BOM to be removed.
    710   */
    711  inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const {
    712    UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
    713    return decoder;
    714  }
    715 
    716  /**
    717   * Instantiates a new decoder for this encoding with BOM handling disabled
    718   * into memory occupied by a previously-instantiated decoder.
    719   *
    720   * If the input starts with bytes that look like a BOM, those bytes are
    721   * not treated as a BOM. (Hence, the decoder never morphs into a decoder
    722   * for another encoding.)
    723   *
    724   * _Note:_ If the caller has performed BOM sniffing on its own but has not
    725   * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
    726   * instead of this method to cause the BOM to be removed.
    727   */
    728  inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const {
    729    encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
    730  }
    731 
    732  /**
    733   * Instantiates a new encoder for the output encoding of this encoding.
    734   */
    735  inline UniquePtr<Encoder> NewEncoder() const {
    736    UniquePtr<Encoder> encoder(encoding_new_encoder(this));
    737    return encoder;
    738  }
    739 
    740  /**
    741   * Instantiates a new encoder for the output encoding of this encoding
    742   * into memory occupied by a previously-instantiated encoder.
    743   */
    744  inline void NewEncoderInto(Encoder& aEncoder) const {
    745    encoding_new_encoder_into(this, &aEncoder);
    746  }
    747 
    748  /**
    749   * Validates UTF-8.
    750   *
    751   * Returns the index of the first byte that makes the input malformed as
    752   * UTF-8 or the length of the input if the input is entirely valid.
    753   */
    754  static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer) {
    755    return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
    756  }
    757 
    758  /**
    759   * Validates ASCII.
    760   *
    761   * Returns the index of the first byte that makes the input malformed as
    762   * ASCII or the length of the input if the input is entirely valid.
    763   */
    764  static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer) {
    765    return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
    766  }
    767 
    768  /**
    769   * Validates ISO-2022-JP ASCII-state data.
    770   *
    771   * Returns the index of the first byte that makes the input not
    772   * representable in the ASCII state of ISO-2022-JP or the length of the
    773   * input if the input is entirely representable in the ASCII state of
    774   * ISO-2022-JP.
    775   */
    776  static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer) {
    777    return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
    778                                                  aBuffer.Length());
    779  }
    780 
    781 private:
    782  Encoding() = delete;
    783  Encoding(const Encoding&) = delete;
    784  Encoding& operator=(const Encoding&) = delete;
    785  ~Encoding() = delete;
    786 };
    787 
    788 /**
    789 * A converter that decodes a byte stream into Unicode according to a
    790 * character encoding in a streaming (incremental) manner.
    791 *
    792 * The various `Decode*` methods take an input buffer (`aSrc`) and an output
    793 * buffer `aDst` both of which are caller-allocated. There are variants for
    794 * both UTF-8 and UTF-16 output buffers.
    795 *
    796 * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
    797 * into `aDst` until one of the following three things happens:
    798 *
    799 * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
    800 *    variants only).
    801 *
    802 * 2. The output buffer has been filled so near capacity that the decoder
    803 *    cannot be sure that processing an additional byte of input wouldn't
    804 *    cause so much output that the output buffer would overflow.
    805 *
    806 * 3. All the input bytes have been processed.
    807 *
    808 * The `Decode*` method then returns a tuple of a status indicating which one
    809 * of the three reasons to return happened, how many input bytes were read,
    810 * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
    811 * when decoding to UTF-16) were written, and in the case of the
    812 * variants performing replacement, a boolean indicating whether an error was
    813 * replaced with the REPLACEMENT CHARACTER during the call.
    814 *
    815 * The number of bytes "written" is what's logically written. Garbage may be
    816 * written in the output buffer beyond the point logically written to.
    817 *
    818 * In the case of the `*WithoutReplacement` variants, the status is a
    819 * `uint32_t` whose possible values are packed info about a malformed byte
    820 * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
    821 * listed above).
    822 *
    823 * Packed info about malformed sequences has the following format:
    824 * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
    825 * indicate the number of bytes that were consumed after the malformed
    826 * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
    827 * the length of the malformed byte sequence (possible decimal values 1, 2,
    828 * 3 or 4). The maximum possible sum of the two is 6.
    829 *
    830 * In the case of methods whose name does not end with
    831 * `*WithoutReplacement`, malformed sequences are automatically replaced
    832 * with the REPLACEMENT CHARACTER and errors do not cause the methods to
    833 * return early.
    834 *
    835 * When decoding to UTF-8, the output buffer must have at least 4 bytes of
    836 * space. When decoding to UTF-16, the output buffer must have at least two
    837 * UTF-16 code units (`char16_t`) of space.
    838 *
    839 * When decoding to UTF-8 without replacement, the methods are guaranteed
    840 * not to return indicating that more output space is needed if the length
    841 * of the output buffer is at least the length returned by
    842 * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
    843 * with replacement, the length of the output buffer that guarantees the
    844 * methods not to return indicating that more output space is needed is given
    845 * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
    846 * or without replacement, the length of the output buffer that guarantees
    847 * the methods not to return indicating that more output space is needed is
    848 * given by `MaxUTF16BufferLength()`.
    849 *
    850 * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
    851 * and the output after each `Decode*` call is guaranteed to consist of
    852 * complete characters. (I.e. the code unit sequence for the last character is
    853 * guaranteed not to be split across output buffers.)
    854 *
    855 * The boolean argument `aLast` indicates that the end of the stream is reached
    856 * when all the bytes in `aSrc` have been consumed.
    857 *
    858 * A `Decoder` object can be used to incrementally decode a byte stream.
    859 *
    860 * During the processing of a single stream, the caller must call `Decode*`
    861 * zero or more times with `aLast` set to `false` and then call `Decode*` at
    862 * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
    863 * the processing of the stream has ended. Otherwise, the caller must call
    864 * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
    865 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
    866 *
    867 * Once the stream has ended, the `Decoder` object must not be used anymore.
    868 * That is, you need to create another one to process another stream.
    869 *
    870 * When the decoder returns `kOutputFull` or the decoder returns a malformed
    871 * result and the caller does not wish to treat it as a fatal error, the input
    872 * buffer `aSrc` may not have been completely consumed. In that case, the caller
    873 * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
    874 * call.
    875 *
    876 * # Infinite loops
    877 *
    878 * When converting with a fixed-size output buffer whose size is too small to
    879 * accommodate one character of output, an infinite loop ensues. When
    880 * converting with a fixed-size output buffer, it generally makes sense to
    881 * make the buffer fairly large (e.g. couple of kilobytes).
    882 */
    883 class Decoder final {
    884 public:
    885  ~Decoder() = default;
    886  static void operator delete(void* aDecoder) {
    887    decoder_free(reinterpret_cast<Decoder*>(aDecoder));
    888  }
    889 
    890  /**
    891   * The `Encoding` this `Decoder` is for.
    892   *
    893   * BOM sniffing can change the return value of this method during the life
    894   * of the decoder.
    895   */
    896  inline NotNull<const mozilla::Encoding*> Encoding() const {
    897    return WrapNotNull(decoder_encoding(this));
    898  }
    899 
    900  /**
    901   * Query the worst-case UTF-8 output size _with replacement_.
    902   *
    903   * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
    904   * that will not overflow given the current state of the decoder and
    905   * `aByteLength` number of additional input bytes when decoding with
    906   * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
    907   * sequence.
    908   */
    909  inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const {
    910    CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
    911    if (max.value() == std::numeric_limits<size_t>::max()) {
    912      // Mark invalid by overflowing
    913      max++;
    914      MOZ_ASSERT(!max.isValid());
    915    }
    916    return max;
    917  }
    918 
    919  /**
    920   * Query the worst-case UTF-8 output size _without replacement_.
    921   *
    922   * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
    923   * that will not overflow given the current state of the decoder and
    924   * `aByteLength` number of additional input bytes when decoding without
    925   * replacement error handling.
    926   *
    927   * Note that this value may be too small for the `WithReplacement` case.
    928   * Use `MaxUTF8BufferLength()` for that case.
    929   */
    930  inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
    931      size_t aByteLength) const {
    932    CheckedInt<size_t> max(
    933        decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
    934    if (max.value() == std::numeric_limits<size_t>::max()) {
    935      // Mark invalid by overflowing
    936      max++;
    937      MOZ_ASSERT(!max.isValid());
    938    }
    939    return max;
    940  }
    941 
    942  /**
    943   * Incrementally decode a byte stream into UTF-8 with malformed sequences
    944   * replaced with the REPLACEMENT CHARACTER.
    945   *
    946   * See the documentation of the class for documentation for `Decode*`
    947   * methods collectively.
    948   */
    949  inline std::tuple<uint32_t, size_t, size_t, bool> DecodeToUTF8(
    950      Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
    951    size_t srcRead = aSrc.Length();
    952    size_t dstWritten = aDst.Length();
    953    bool hadReplacements;
    954    uint32_t result =
    955        decoder_decode_to_utf8(this, aSrc.Elements(), &srcRead, aDst.Elements(),
    956                               &dstWritten, aLast, &hadReplacements);
    957    return {result, srcRead, dstWritten, hadReplacements};
    958  }
    959 
    960  /**
    961   * Incrementally decode a byte stream into UTF-8 _without replacement_.
    962   *
    963   * See the documentation of the class for documentation for `Decode*`
    964   * methods collectively.
    965   */
    966  inline std::tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
    967      Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
    968    size_t srcRead = aSrc.Length();
    969    size_t dstWritten = aDst.Length();
    970    uint32_t result = decoder_decode_to_utf8_without_replacement(
    971        this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
    972    return {result, srcRead, dstWritten};
    973  }
    974 
    975  /**
    976   * Query the worst-case UTF-16 output size (with or without replacement).
    977   *
    978   * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
    979   * that will not overflow given the current state of the decoder and
    980   * `aByteLength` number of additional input bytes.
    981   *
    982   * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
    983   * return value of this method applies also in the
    984   * `_without_replacement` case.
    985   */
    986  inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const {
    987    CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
    988    if (max.value() == std::numeric_limits<size_t>::max()) {
    989      // Mark invalid by overflowing
    990      max++;
    991      MOZ_ASSERT(!max.isValid());
    992    }
    993    return max;
    994  }
    995 
    996  /**
    997   * Incrementally decode a byte stream into UTF-16 with malformed sequences
    998   * replaced with the REPLACEMENT CHARACTER.
    999   *
   1000   * See the documentation of the class for documentation for `Decode*`
   1001   * methods collectively.
   1002   */
   1003  inline std::tuple<uint32_t, size_t, size_t, bool> DecodeToUTF16(
   1004      Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
   1005    size_t srcRead = aSrc.Length();
   1006    size_t dstWritten = aDst.Length();
   1007    bool hadReplacements;
   1008    uint32_t result = decoder_decode_to_utf16(this, aSrc.Elements(), &srcRead,
   1009                                              aDst.Elements(), &dstWritten,
   1010                                              aLast, &hadReplacements);
   1011    return {result, srcRead, dstWritten, hadReplacements};
   1012  }
   1013 
   1014  /**
   1015   * Incrementally decode a byte stream into UTF-16 _without replacement_.
   1016   *
   1017   * See the documentation of the class for documentation for `Decode*`
   1018   * methods collectively.
   1019   */
   1020  inline std::tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
   1021      Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast) {
   1022    size_t srcRead = aSrc.Length();
   1023    size_t dstWritten = aDst.Length();
   1024    uint32_t result = decoder_decode_to_utf16_without_replacement(
   1025        this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
   1026    return {result, srcRead, dstWritten};
   1027  }
   1028 
   1029  /**
   1030   * Checks for compatibility with storing Unicode scalar values as unsigned
   1031   * bytes taking into account the state of the decoder.
   1032   *
   1033   * Returns `mozilla::Nothing()` if the decoder is not in a neutral state,
   1034   * including waiting for the BOM, or if the encoding is never
   1035   * Latin1-byte-compatible.
   1036   *
   1037   * Otherwise returns the index of the first byte whose unsigned value doesn't
   1038   * directly correspond to the decoded Unicode scalar value, or the length
   1039   * of the input if all bytes in the input decode directly to scalar values
   1040   * corresponding to the unsigned byte values.
   1041   *
   1042   * Does not change the state of the decoder.
   1043   *
   1044   * Do not use this unless you are supporting SpiderMonkey-style string
   1045   * storage optimizations.
   1046   */
   1047  inline mozilla::Maybe<size_t> Latin1ByteCompatibleUpTo(
   1048      Span<const uint8_t> aBuffer) const {
   1049    size_t upTo = decoder_latin1_byte_compatible_up_to(this, aBuffer.Elements(),
   1050                                                       aBuffer.Length());
   1051    if (upTo == std::numeric_limits<size_t>::max()) {
   1052      return mozilla::Nothing();
   1053    }
   1054    return mozilla::Some(upTo);
   1055  }
   1056 
   1057 private:
   1058  Decoder() = delete;
   1059  Decoder(const Decoder&) = delete;
   1060  Decoder& operator=(const Decoder&) = delete;
   1061 };
   1062 
   1063 /**
   1064 * A converter that encodes a Unicode stream into bytes according to a
   1065 * character encoding in a streaming (incremental) manner.
   1066 *
   1067 * The various `Encode*` methods take an input buffer (`aSrc`) and an output
   1068 * buffer `aDst` both of which are caller-allocated. There are variants for
   1069 * both UTF-8 and UTF-16 input buffers.
   1070 *
   1071 * An `Encode*` method encodes characters from `aSrc` into bytes
   1072 * stored into `aDst` until one of the following three things happens:
   1073 *
   1074 * 1. An unmappable character is encountered (`*WithoutReplacement` variants
   1075 *    only).
   1076 *
   1077 * 2. The output buffer has been filled so near capacity that the encoder
   1078 *    cannot be sure that processing an additional character of input wouldn't
   1079 *    cause so much output that the output buffer would overflow.
   1080 *
   1081 * 3. All the input characters have been processed.
   1082 *
   1083 * The `Encode*` method then returns a tuple of a status indicating which one
   1084 * of the three reasons to return happened, how many input code units (`uint8_t`
   1085 * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
   1086 * how many output bytes were written, and in the case of the variants that
   1087 * perform replacement, a boolean indicating whether an unmappable
   1088 * character was replaced with a numeric character reference during the call.
   1089 *
   1090 * The number of bytes "written" is what's logically written. Garbage may be
   1091 * written in the output buffer beyond the point logically written to.
   1092 *
   1093 * In the case of the methods whose name ends with
   1094 * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
   1095 * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
   1096 * to the three cases listed above).
   1097 *
   1098 * In the case of methods whose name does not end with
   1099 * `*WithoutReplacement`, unmappable characters are automatically replaced
   1100 * with the corresponding numeric character references and unmappable
   1101 * characters do not cause the methods to return early.
   1102 *
   1103 * When encoding from UTF-8 without replacement, the methods are guaranteed
   1104 * not to return indicating that more output space is needed if the length
   1105 * of the output buffer is at least the length returned by
   1106 * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
   1107 * UTF-8 with replacement, the length of the output buffer that guarantees the
   1108 * methods not to return indicating that more output space is needed in the
   1109 * absence of unmappable characters is given by
   1110 * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
   1111 * UTF-16 without replacement, the methods are guaranteed not to return
   1112 * indicating that more output space is needed if the length of the output
   1113 * buffer is at least the length returned by
   1114 * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
   1115 * from UTF-16 with replacement, the length of the output buffer that
   1116 * guarantees the methods not to return indicating that more output space is
   1117 * needed in the absence of unmappable characters is given by
   1118 * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
   1119 * When encoding with replacement, applications are not expected to size the
   1120 * buffer for the worst case ahead of time but to resize the buffer if there
   1121 * are unmappable characters. This is why max length queries are only available
   1122 * for the case where there are no unmappable characters.
   1123 *
   1124 * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
   1125 * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
   1126 * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
   1127 * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
   1128 * surrogate pairs are not split across input buffer boundaries.
   1129 *
   1130 * After an `Encode*` call returns, the output produced so far, taken as a
   1131 * whole from the start of the stream, is guaranteed to consist of a valid
   1132 * byte sequence in the target encoding. (I.e. the code unit sequence for a
   1133 * character is guaranteed not to be split across output buffers. However, due
   1134 * to the stateful nature of ISO-2022-JP, the stream needs to be considered
   1135 * from the start for it to be valid. For other encodings, the validity holds
   1136 * on a per-output buffer basis.)
   1137 *
   1138 * The boolean argument `aLast` indicates that the end of the stream is reached
   1139 * when all the characters in `aSrc` have been consumed. This argument is needed
   1140 * for ISO-2022-JP and is ignored for other encodings.
   1141 *
   1142 * An `Encoder` object can be used to incrementally encode a Unicode stream.
   1143 *
   1144 * During the processing of a single stream, the caller must call `Encode*`
   1145 * zero or more times with `aLast` set to `false` and then call `Encode*` at
   1146 * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
   1147 * the processing of the stream has ended. Otherwise, the caller must call
   1148 * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
   1149 * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
   1150 *
   1151 * Once the stream has ended, the `Encoder` object must not be used anymore.
   1152 * That is, you need to create another one to process another stream.
   1153 *
   1154 * When the encoder returns `kOutputFull` or the encoder returns an unmappable
   1155 * result and the caller does not wish to treat it as a fatal error, the input
   1156 * buffer `aSrc` may not have been completely consumed. In that case, the caller
   1157 * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
   1158 * call.
   1159 *
   1160 * # Infinite loops
   1161 *
   1162 * When converting with a fixed-size output buffer whose size is too small to
   1163 * accommodate one character of output, an infinite loop ensues. When
   1164 * converting with a fixed-size output buffer, it generally makes sense to
   1165 * make the buffer fairly large (e.g. couple of kilobytes).
   1166 */
   1167 class Encoder final {
   1168 public:
   1169  ~Encoder() = default;
   1170 
   1171  static void operator delete(void* aEncoder) {
   1172    encoder_free(reinterpret_cast<Encoder*>(aEncoder));
   1173  }
   1174 
   1175  /**
   1176   * The `Encoding` this `Encoder` is for.
   1177   */
   1178  inline NotNull<const mozilla::Encoding*> Encoding() const {
   1179    return WrapNotNull(encoder_encoding(this));
   1180  }
   1181 
   1182  /**
   1183   * Returns `true` if this is an ISO-2022-JP encoder that's not in the
   1184   * ASCII state and `false` otherwise.
   1185   */
   1186  inline bool HasPendingState() const {
   1187    return encoder_has_pending_state(this);
   1188  }
   1189 
   1190  /**
   1191   * Query the worst-case output size when encoding from UTF-8 with
   1192   * replacement.
   1193   *
   1194   * Returns the size of the output buffer in bytes that will not overflow
   1195   * given the current state of the encoder and `aByteLength` number of
   1196   * additional input code units if there are no unmappable characters in
   1197   * the input.
   1198   */
   1199  inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
   1200      size_t aByteLength) const {
   1201    CheckedInt<size_t> max(
   1202        encoder_max_buffer_length_from_utf8_if_no_unmappables(this,
   1203                                                              aByteLength));
   1204    if (max.value() == std::numeric_limits<size_t>::max()) {
   1205      // Mark invalid by overflowing
   1206      max++;
   1207      MOZ_ASSERT(!max.isValid());
   1208    }
   1209    return max;
   1210  }
   1211 
   1212  /**
   1213   * Query the worst-case output size when encoding from UTF-8 without
   1214   * replacement.
   1215   *
   1216   * Returns the size of the output buffer in bytes that will not overflow
   1217   * given the current state of the encoder and `aByteLength` number of
   1218   * additional input code units.
   1219   */
   1220  inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
   1221      size_t aByteLength) const {
   1222    CheckedInt<size_t> max(
   1223        encoder_max_buffer_length_from_utf8_without_replacement(this,
   1224                                                                aByteLength));
   1225    if (max.value() == std::numeric_limits<size_t>::max()) {
   1226      // Mark invalid by overflowing
   1227      max++;
   1228      MOZ_ASSERT(!max.isValid());
   1229    }
   1230    return max;
   1231  }
   1232 
   1233  /**
   1234   * Incrementally encode into byte stream from UTF-8 with unmappable
   1235   * characters replaced with HTML (decimal) numeric character references.
   1236   *
   1237   * See the documentation of the class for documentation for `Encode*`
   1238   * methods collectively.
   1239   *
   1240   * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
   1241   * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
   1242   * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
   1243   */
   1244  inline std::tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF8(
   1245      Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
   1246    size_t srcRead = aSrc.Length();
   1247    size_t dstWritten = aDst.Length();
   1248    bool hadReplacements;
   1249    uint32_t result = encoder_encode_from_utf8(this, aSrc.Elements(), &srcRead,
   1250                                               aDst.Elements(), &dstWritten,
   1251                                               aLast, &hadReplacements);
   1252    return {result, srcRead, dstWritten, hadReplacements};
   1253  }
   1254 
   1255  /**
   1256   * Incrementally encode into byte stream from UTF-8 _without replacement_.
   1257   *
   1258   * See the documentation of the class for documentation for `Encode*`
   1259   * methods collectively.
   1260   *
   1261   * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
   1262   * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
   1263   * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
   1264   */
   1265  inline std::tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
   1266      Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast) {
   1267    size_t srcRead = aSrc.Length();
   1268    size_t dstWritten = aDst.Length();
   1269    uint32_t result = encoder_encode_from_utf8_without_replacement(
   1270        this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
   1271    return {result, srcRead, dstWritten};
   1272  }
   1273 
   1274  /**
   1275   * Query the worst-case output size when encoding from UTF-16 with
   1276   * replacement.
   1277   *
   1278   * Returns the size of the output buffer in bytes that will not overflow
   1279   * given the current state of the encoder and `aU16Length` number of
   1280   * additional input code units if there are no unmappable characters in
   1281   * the input.
   1282   */
   1283  inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
   1284      size_t aU16Length) const {
   1285    CheckedInt<size_t> max(
   1286        encoder_max_buffer_length_from_utf16_if_no_unmappables(this,
   1287                                                               aU16Length));
   1288    if (max.value() == std::numeric_limits<size_t>::max()) {
   1289      // Mark invalid by overflowing
   1290      max++;
   1291      MOZ_ASSERT(!max.isValid());
   1292    }
   1293    return max;
   1294  }
   1295 
   1296  /**
   1297   * Query the worst-case output size when encoding from UTF-16 without
   1298   * replacement.
   1299   *
   1300   * Returns the size of the output buffer in bytes that will not overflow
   1301   * given the current state of the encoder and `aU16Length` number of
   1302   * additional input code units.
   1303   */
   1304  inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
   1305      size_t aU16Length) const {
   1306    CheckedInt<size_t> max(
   1307        encoder_max_buffer_length_from_utf16_without_replacement(this,
   1308                                                                 aU16Length));
   1309    if (max.value() == std::numeric_limits<size_t>::max()) {
   1310      // Mark invalid by overflowing
   1311      max++;
   1312      MOZ_ASSERT(!max.isValid());
   1313    }
   1314    return max;
   1315  }
   1316 
   1317  /**
   1318   * Incrementally encode into byte stream from UTF-16 with unmappable
   1319   * characters replaced with HTML (decimal) numeric character references.
   1320   *
   1321   * See the documentation of the class for documentation for `Encode*`
   1322   * methods collectively.
   1323   */
   1324  inline std::tuple<uint32_t, size_t, size_t, bool> EncodeFromUTF16(
   1325      Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
   1326    size_t srcRead = aSrc.Length();
   1327    size_t dstWritten = aDst.Length();
   1328    bool hadReplacements;
   1329    uint32_t result = encoder_encode_from_utf16(this, aSrc.Elements(), &srcRead,
   1330                                                aDst.Elements(), &dstWritten,
   1331                                                aLast, &hadReplacements);
   1332    return {result, srcRead, dstWritten, hadReplacements};
   1333  }
   1334 
   1335  /**
   1336   * Incrementally encode into byte stream from UTF-16 _without replacement_.
   1337   *
   1338   * See the documentation of the class for documentation for `Encode*`
   1339   * methods collectively.
   1340   */
   1341  inline std::tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
   1342      Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast) {
   1343    size_t srcRead = aSrc.Length();
   1344    size_t dstWritten = aDst.Length();
   1345    uint32_t result = encoder_encode_from_utf16_without_replacement(
   1346        this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
   1347    return {result, srcRead, dstWritten};
   1348  }
   1349 
   1350 private:
   // Default construction, copy construction and copy assignment are all
   // deleted, so C++ code cannot create or copy an Encoder by value.
   // NOTE(review): presumably instances are opaque objects obtained only
   // through the encoding_rs FFI (the methods pass `this` to encoder_*
   // functions) -- confirm against the factory functions.
   1351  Encoder() = delete;
   1352  Encoder(const Encoder&) = delete;
   1353  Encoder& operator=(const Encoder&) = delete;
   1354 };
   1355 
   1356 };  // namespace mozilla
   1357 
   1358 #endif  // mozilla_Encoding_h
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE