tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

EncodingDetector.h (5673B)


      1 // Copyright 2019 Mozilla Foundation. See the COPYRIGHT
      2 // file at the top-level directory of this distribution.
      3 //
      4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
      5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
      6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
      7 // option. This file may not be copied, modified, or distributed
      8 // except according to those terms.
      9 
     10 // Mostly copied and pasted from
     11 // third_party/rust/chardetng/src/lib.rs , so
     12 // "top-level directory of this distribution" above refers to
     13 // third_party/rust/chardetng/
     14 
     15 #ifndef mozilla_EncodingDetector_h
     16 #define mozilla_EncodingDetector_h
     17 
     18 #include "mozilla/Encoding.h"
     19 
     20 namespace mozilla {
     21 class EncodingDetector;
     22 };  // namespace mozilla
     23 // The forward declaration above lets the macro below name the class before its full definition.
     24 #define CHARDETNG_ENCODING_DETECTOR mozilla::EncodingDetector
     25 // NOTE(review): presumably chardetng.h types its C API with CHARDETNG_ENCODING_DETECTOR, hence define-before-include - confirm.
     26 #include "chardetng.h"
     27 
     28 namespace mozilla {
     29 
     30 /**
     31 * A Web browser-oriented detector for guessing what character
     32 * encoding a stream of bytes is encoded in.
     33 *
     34 * The bytes are fed to the detector incrementally using the `feed`
     35 * method. The current guess of the detector can be queried using
     36 * the `guess` method. The guessing parameters are arguments to the
     37 * `guess` method rather than arguments to the constructor in order
     38 * to enable the application to check if the arguments affect the
     39 * guessing outcome. (The specific use case is to disable UI for
     40 * re-running the detector with UTF-8 allowed and the top-level
     41 * domain name ignored if those arguments don't change the guess.)
     42 */
     43 class EncodingDetector final {
     44 public:
     45  ~EncodingDetector() = default;
     46  // Route `delete` to chardetng_encoding_detector_free(); Create() obtains instances from chardetng_encoding_detector_new().
     47  static void operator delete(void* aDetector) {
     48    chardetng_encoding_detector_free(
     49        reinterpret_cast<EncodingDetector*>(aDetector));
     50  }
     51 
     52  /**
     53   * Creates a new instance of the detector, owned by the returned `UniquePtr`.
     54   */
     55  static inline UniquePtr<EncodingDetector> Create() {
     56    UniquePtr<EncodingDetector> detector(chardetng_encoding_detector_new());
     57    return detector;
     58  }
     59 
     60  /**
     61   * Queries whether the TLD `aTLD` is considered non-generic and could affect
     62   * the guess.
     63   */
     64  static inline bool TldMayAffectGuess(Span<const char> aTLD) {
     65    return chardetng_encoding_detector_tld_may_affect_guess(aTLD.Elements(),
     66                                                            aTLD.Length());
     67  }
     68 
     69  /**
     70   * Inform the detector of a chunk of input.
     71   *
     72   * The byte stream is represented as a sequence of calls to this
     73   * method such that the concatenation of the arguments to this
     74   * method form the byte stream. It does not matter how the application
     75   * chooses to chunk the stream. It is OK to call this method with
     76   * a zero-length byte slice.
     77   *
     78   * The end of the stream is indicated by calling this method with
     79   * `aLast` set to `true`. In that case, the end of the stream is
     80   * considered to occur after the last byte of the `aBuffer` (which
     81   * may be zero-length) passed in the same call. Once this method
     82   * has been called with `aLast` set to `true` this method must not
     83   * be called again.
     84   *
     85   * If you want to perform detection on just the prefix of a longer
     86   * stream, do not pass `aLast=true` after the prefix if the stream
     87   * actually still continues.
     88   *
     89   * Returns `true` if after processing `aBuffer` the stream has
     90   * contained at least one non-ASCII byte and `false` if only
     91   * ASCII has been seen so far.
     92   *
     93   * # Panics
     94   *
     95   * If this method has previously been called with `aLast` set to `true`.
     96   */
     97  inline bool Feed(Span<const uint8_t> aBuffer, bool aLast) {
     98    return chardetng_encoding_detector_feed(this, aBuffer.Elements(),
     99                                            aBuffer.Length(), aLast);
    100  }
    101 
    102  /**
    103   * Guess the encoding given the bytes pushed to the detector so far
    104   * (via `Feed()`), the top-level domain name from which the bytes were
    105   * loaded, and an indication of whether to consider UTF-8 as a permissible
    106   * guess.
    107   *
    108   * The `aTLD` argument takes the rightmost DNS label of the hostname of the
    109   * host the stream was loaded from in lower-case ASCII form. That is, if
    110   * the label is an internationalized top-level domain name, it must be
    111   * provided in its Punycode form. If the TLD that the stream was loaded
    112   * from is unavailable, an empty `Span` may be passed instead, which is
    113   * equivalent to passing a `Span` for "com".
    114   *
    115   * If the `aAllowUTF8` argument is set to `false`, the return value of
    116   * this method won't be `UTF_8_ENCODING`. When performing detection
    117   * on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
    118   * unless the user has taken a specific contextual action to request an
    119   * override. This way, Web developers cannot start depending on UTF-8
    120   * detection. Such reliance would make the Web Platform more brittle.
    121   *
    122   * Returns the guessed encoding.
    123   *
    124   * # Panics
    125   *
    126   * If `aTLD` contains non-ASCII, period, or upper-case letters. (The panic
    127   * condition is intentionally limited to signs of failing to extract the
    128   * label correctly, failing to provide it in its Punycode form, and failure
    129   * to lower-case it. Full DNS label validation is intentionally not performed
    130   * to avoid panics when the reality doesn't match the specs.)
    131   */
    132  inline mozilla::NotNull<const mozilla::Encoding*> Guess(
    133      Span<const char> aTLD, bool aAllowUTF8) const {
    134    return WrapNotNull(chardetng_encoding_detector_guess(
    135        this, aTLD.Elements(), aTLD.Length(), aAllowUTF8));
    136  }
    137  // Default construction and copying are deleted below; Create() is the only factory this class offers.
    138 private:
    139  EncodingDetector() = delete;
    140  EncodingDetector(const EncodingDetector&) = delete;
    141  EncodingDetector& operator=(const EncodingDetector&) = delete;
    142 };
    143 
    144 };  // namespace mozilla
    145 
    146 #endif  // mozilla_EncodingDetector_h