EncodingDetector.h (5673B)
1 // Copyright 2019 Mozilla Foundation. See the COPYRIGHT 2 // file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 // Mostly copied and pasted from 11 // third_party/rust/chardetng/src/lib.rs , so 12 // "top-level directory of this distribution" above refers to 13 // third_party/rust/chardetng/ 14 15 #ifndef mozilla_EncodingDetector_h 16 #define mozilla_EncodingDetector_h 17 18 #include "mozilla/Encoding.h" 19 20 namespace mozilla { 21 class EncodingDetector; 22 }; // namespace mozilla 23 24 #define CHARDETNG_ENCODING_DETECTOR mozilla::EncodingDetector 25 26 #include "chardetng.h" 27 28 namespace mozilla { 29 30 /** 31 * A Web browser-oriented detector for guessing what character 32 * encoding a stream of bytes is encoded in. 33 * 34 * The bytes are fed to the detector incrementally using the `feed` 35 * method. The current guess of the detector can be queried using 36 * the `guess` method. The guessing parameters are arguments to the 37 * `guess` method rather than arguments to the constructor in order 38 * to enable the application to check if the arguments affect the 39 * guessing outcome. (The specific use case is to disable UI for 40 * re-running the detector with UTF-8 allowed and the top-level 41 * domain name ignored if those arguments don't change the guess.) 42 */ 43 class EncodingDetector final { 44 public: 45 ~EncodingDetector() = default; 46 47 static void operator delete(void* aDetector) { 48 chardetng_encoding_detector_free( 49 reinterpret_cast<EncodingDetector*>(aDetector)); 50 } 51 52 /** 53 * Creates a new instance of the detector. 54 */ 55 static inline UniquePtr<EncodingDetector> Create() { 56 UniquePtr<EncodingDetector> detector(chardetng_encoding_detector_new()); 57 return detector; 58 } 59 60 /** 61 * Queries whether the TLD is considered non-generic and could affect the 62 * guess. 63 */ 64 static inline bool TldMayAffectGuess(Span<const char> aTLD) { 65 return chardetng_encoding_detector_tld_may_affect_guess(aTLD.Elements(), 66 aTLD.Length()); 67 } 68 69 /** 70 * Inform the detector of a chunk of input. 71 * 72 * The byte stream is represented as a sequence of calls to this 73 * method such that the concatenation of the arguments to this 74 * method form the byte stream. It does not matter how the application 75 * chooses to chunk the stream. It is OK to call this method with 76 * a zero-length byte slice. 77 * 78 * The end of the stream is indicated by calling this method with 79 * `aLast` set to `true`. In that case, the end of the stream is 80 * considered to occur after the last byte of the `aBuffer` (which 81 * may be zero-length) passed in the same call. Once this method 82 * has been called with `last` set to `true` this method must not 83 * be called again. 84 * 85 * If you want to perform detection on just the prefix of a longer 86 * stream, do not pass `aLast=true` after the prefix if the stream 87 * actually still continues. 88 * 89 * Returns `true` if after processing `aBuffer` the stream has 90 * contained at least one non-ASCII byte and `false` if only 91 * ASCII has been seen so far. 92 * 93 * # Panics 94 * 95 * If this method has previously been called with `aLast` set to `true`. 96 */ 97 inline bool Feed(Span<const uint8_t> aBuffer, bool aLast) { 98 return chardetng_encoding_detector_feed(this, aBuffer.Elements(), 99 aBuffer.Length(), aLast); 100 } 101 102 /** 103 * Guess the encoding given the bytes pushed to the detector so far 104 * (via `Feed()`), the top-level domain name from which the bytes were 105 * loaded, and an indication of whether to consider UTF-8 as a permissible 106 * guess. 107 * 108 * The `aTld` argument takes the rightmost DNS label of the hostname of the 109 * host the stream was loaded from in lower-case ASCII form. That is, if 110 * the label is an internationalized top-level domain name, it must be 111 * provided in its Punycode form. If the TLD that the stream was loaded 112 * from is unavalable, an empty `Spane` may be passed instead, which is 113 * equivalent to passing a `Span` for "com". 114 * 115 * If the `aAllowUTF8` argument is set to `false`, the return value of 116 * this method won't be `UTF_8_ENCODING`. When performing detection 117 * on `text/html` on non-`file:` URLs, Web browsers must pass `false`, 118 * unless the user has taken a specific contextual action to request an 119 * override. This way, Web developers cannot start depending on UTF-8 120 * detection. Such reliance would make the Web Platform more brittle. 121 * 122 * Returns the guessed encoding. 123 * 124 * # Panics 125 * 126 * If `aTld` contains non-ASCII, period, or upper-case letters. (The panic 127 * condition is intentionally limited to signs of failing to extract the 128 * label correctly, failing to provide it in its Punycode form, and failure 129 * to lower-case it. Full DNS label validation is intentionally not performed 130 * to avoid panics when the reality doesn't match the specs.) 131 */ 132 inline mozilla::NotNull<const mozilla::Encoding*> Guess( 133 Span<const char> aTLD, bool aAllowUTF8) const { 134 return WrapNotNull(chardetng_encoding_detector_guess( 135 this, aTLD.Elements(), aTLD.Length(), aAllowUTF8)); 136 } 137 138 private: 139 EncodingDetector() = delete; 140 EncodingDetector(const EncodingDetector&) = delete; 141 EncodingDetector& operator=(const EncodingDetector&) = delete; 142 }; 143 144 }; // namespace mozilla 145 146 #endif // mozilla_EncodingDetector_h