[ tor-browser ].git.dasho

Utf8.h (25263B)
      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 /*
      8 * UTF-8-related functionality, including a type-safe structure representing a
      9 * UTF-8 code unit.
     10 */
     11 
     12 #ifndef mozilla_Utf8_h
     13 #define mozilla_Utf8_h
     14 
#include "mozilla/Casting.h"    // for mozilla::AssertedCast
#include "mozilla/Likely.h"     // for MOZ_UNLIKELY
#include "mozilla/Maybe.h"      // for mozilla::Maybe
#include "mozilla/Span.h"       // for mozilla::Span
#include "mozilla/TextUtils.h"  // for mozilla::IsAscii and via Latin1.h for
                                // encoding_rs_mem.h and MOZ_HAS_JSRUST.
#include "mozilla/Types.h"      // for MFBT_API

#include <limits>    // for std::numeric_limits
#include <limits.h>  // for CHAR_BIT
#include <stddef.h>  // for size_t
#include <stdint.h>  // for uint8_t
#include <tuple>     // for std::tuple, std::make_tuple
     27 
     28 #if MOZ_HAS_JSRUST()
     29 // Can't include mozilla/Encoding.h here.
     30 extern "C" {
     31 // Declared as uint8_t instead of char to match declaration in another header.
     32 size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
     33 }
     34 #else
     35 namespace mozilla {
     36 namespace detail {
     37 extern MFBT_API bool IsValidUtf8(const void* aCodeUnits, size_t aCount);
     38 };  // namespace detail
     39 };  // namespace mozilla
     40 #endif  // MOZ_HAS_JSRUST
     41 
     42 namespace mozilla {
     43 
     44 union Utf8Unit;
     45 
     46 static_assert(CHAR_BIT == 8,
     47              "Utf8Unit won't work so well with non-octet chars");
     48 
     49 /**
     50 * A code unit within a UTF-8 encoded string.  (A code unit is the smallest
     51 * unit within the Unicode encoding of a string.  For UTF-8 this is an 8-bit
     52 * number; for UTF-16 it would be a 16-bit number.)
     53 *
     54 * This is *not* the same as a single code point: in UTF-8, non-ASCII code
     55 * points are constituted by multiple code units.
     56 */
     57 union Utf8Unit {
     58 private:
     59  // Utf8Unit is a union wrapping a raw |char|.  The C++ object model and C++
     60  // requirements as to how objects may be accessed with respect to their actual
     61  // types (almost?) uniquely compel this choice.
     62  //
     63  // Our requirements for a UTF-8 code unit representation are:
     64  //
     65  //   1. It must be "compatible" with C++ character/string literals that use
     66  //      the UTF-8 encoding.  Given a properly encoded C++ literal, you should
     67  //      be able to use |Utf8Unit| and friends to access it; given |Utf8Unit|
     68  //      and friends (particularly UnicodeData), you should be able to access
     69  //      C++ character types for their contents.
     70  //   2. |Utf8Unit| and friends must convert to/from |char| and |char*| only by
     71  //      explicit operation.
     72  //   3. |Utf8Unit| must participate in overload resolution and template type
     73  //      equivalence (that is, given |template<class> class X|, when |X<T>| and
     74  //      |X<U>| are the same type) distinctly from the C++ character types.
     75  //
     76  // And a few nice-to-haves (at least for the moment):
     77  //
     78  //   4. The representation should use unsigned numbers, to avoid undefined
     79  //      behavior that can arise with signed types, and because Unicode code
     80  //      points and code units are unsigned.
     81  //   5. |Utf8Unit| and friends should be convertible to/from |unsigned char|
     82  //      and |unsigned char*|, for APIs that (because of #4 above) use those
     83  //      types as the "natural" choice for UTF-8 data.
     84  //
     85  // #1 requires that |Utf8Unit| "incorporate" a C++ character type: one of
     86  // |{,{un,}signed} char|.[0]  |uint8_t| won't work because it might not be a
     87  // C++ character type.
     88  //
     89  // #2 and #3 mean that |Utf8Unit| can't *be* such a type (or a typedef to one:
     90  // typedefs don't generate *new* types, just type aliases).  This requires a
     91  // compound type.
     92  //
     93  // The ultimate representation (and character type in it) is constrained by
     94  // C++14 [basic.lval]p10 that defines how objects may be accessed, with
     95  // respect to the dynamic type in memory and the actual type used to access
     96  // them.  It reads:
     97  //
     98  //     If a program attempts to access the stored value of an object
     99  //     through a glvalue of other than one of the following types the
    100  //     behavior is undefined:
    101  //
    102  //       1. the dynamic type of the object,
    103  //       2. a cv-qualified version of the dynamic type of the object,
    104  //       ...other types irrelevant here...
    105  //       3. an aggregate or union type that includes one of the
    106  //          aforementioned types among its elements or non-static data
    107  //          members (including, recursively, an element or non-static
    108  //          data member of a subaggregate or contained union),
    109  //       ...more irrelevant types...
    110  //       4. a char or unsigned char type.
    111  //
    112  // Accessing (wrapped) UTF-8 data as |char|/|unsigned char| is allowed no
    113  // matter the representation by #4.  (Briefly set aside what values are seen.)
    114  // (And #2 allows |const| on either the dynamic type or the accessing type.)
    115  // (|signed char| is really only useful for small signed numbers, not
    116  // characters, so we ignore it.)
    117  //
    118  // If we interpret contents as |char|/|unsigned char| contrary to the actual
    119  // type stored there, what happens?  C++14 [basic.fundamental]p1 requires
    120  // character types be identically aligned/sized; C++14 [basic.fundamental]p3
    121  // requires |signed char| and |unsigned char| have the same value
    122  // representation.  C++ doesn't require identical bitwise representation, tho.
    123  // Practically we could assume it, but this verges on C++ spec bits best not
    124  // *relied* on for correctness, if possible.
    125  //
    126  // So we don't expose |Utf8Unit|'s contents as |unsigned char*|: only |char|
    127  // and |char*|.  Instead we safely expose |unsigned char| by fully-defined
    128  // *integral conversion* (C++14 [conv.integral]p2).  Integral conversion from
    129  // |unsigned char| → |char| has only implementation-defined behavior.  It'd be
    130  // better not to depend on that, but given twos-complement won, it should be
    131  // okay.  (Also |unsigned char*| is awkward enough to work with for strings
    132  // that it probably doesn't appear in string manipulation much anyway, only in
    133  // places that should really use |Utf8Unit| directly.)
    134  //
    135  // The opposite direction -- interpreting |char| or |char*| data through
    136  // |Utf8Unit| -- isn't tricky as long as |Utf8Unit| contains a |char| as
    137  // decided above, using #3.  An "aggregate or union" will work that contains a
    138  // |char|.  Oddly, an aggregate won't work: C++14 [dcl.init.aggr]p1 says
    139  // aggregates must have "no private or protected non-static data members", and
    140  // we want to keep the inner |char| hidden.  So a |struct| is out, and only
    141  // |union| remains.
    142  //
    143  // (Enums are not "an aggregate or union type", so [maybe surprisingly] we
    144  // can't make |Utf8Unit| an enum class with |char| underlying type, because we
    145  // are given no license to treat |char| memory as such an |enum|'s memory.)
    146  //
    147  // Therefore |Utf8Unit| is a union type with a |char| non-static data member.
    148  // This satisfies all our requirements.  It also supports the nice-to-haves of
    149  // creating a |Utf8Unit| from an |unsigned char|, and being convertible to
    150  // |unsigned char|.  It doesn't satisfy the nice-to-haves of using an
    151  // |unsigned char| internally, nor of letting us wrap an existing
    152  // |unsigned char| or pointer to one.  We probably *could* do these, if we
    153  // were willing to rely harder on implementation-defined behaviors, but for
    154  // now we privilege C++'s main character type over some conceptual purity.
    155  //
    156  // 0. There's a proposal for a UTF-8 character type distinct from the existing
    157  //    C++ narrow character types:
    158  //
    159  //      http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0482r0.html
    160  //
    161  //    but it hasn't been standardized (and might never be), and none of the
    162  //    compilers we really care about have implemented it.  Maybe someday we
    163  //    can change our implementation to it without too much trouble, if we're
    164  //    lucky...
    165  char mValue = '\0';
    166 
    167 public:
    168  Utf8Unit() = default;
    169 
    170  explicit constexpr Utf8Unit(char aUnit) : mValue(aUnit) {}
    171 
    172  explicit constexpr Utf8Unit(unsigned char aUnit)
    173      : mValue(static_cast<char>(aUnit)) {
    174    // Per the above comment, the prior cast is integral conversion with
    175    // implementation-defined semantics, and we regretfully but unavoidably
    176    // assume the conversion does what we want it to.
    177  }
    178 
    179 #ifdef __cpp_char8_t
    180  explicit constexpr Utf8Unit(char8_t aUnit)
    181      : mValue(static_cast<char>(aUnit)) {}
    182 #endif
    183 
    184  constexpr bool operator==(const Utf8Unit& aOther) const {
    185    return mValue == aOther.mValue;
    186  }
    187 
    188  constexpr bool operator!=(const Utf8Unit& aOther) const {
    189    return !(*this == aOther);
    190  }
    191 
    192  /** Convert a UTF-8 code unit to a raw char. */
    193  constexpr char toChar() const {
    194    // Only a |char| is ever permitted to be written into this location, so this
    195    // is both permissible and returns the desired value.
    196    return mValue;
    197  }
    198 
    199  /** Convert a UTF-8 code unit to a raw unsigned char. */
    200  constexpr unsigned char toUnsignedChar() const {
    201    // Per the above comment, this is well-defined integral conversion.
    202    return static_cast<unsigned char>(mValue);
    203  }
    204 
    205  /** Convert a UTF-8 code unit to a uint8_t. */
    206  constexpr uint8_t toUint8() const {
    207    // Per the above comment, this is well-defined integral conversion.
    208    return static_cast<uint8_t>(mValue);
    209  }
    210 
    211  // We currently don't expose |&mValue|.  |UnicodeData| sort of does, but
    212  // that's a somewhat separate concern, justified in different comments in
    213  // that other code.
    214 };
    215 
    216 /**
    217 * Reinterpret the address of a UTF-8 code unit as |const unsigned char*|.
    218 *
    219 * Assuming proper backing has been set up, the resulting |const unsigned char*|
    220 * may validly be dereferenced.
    221 *
    222 * No access is provided to mutate this underlying memory as |unsigned char|.
    223 * Presently memory inside |Utf8Unit| is *only* stored as |char|, and we are
    224 * loath to offer a way to write non-|char| data until absolutely necessary.
    225 */
    226 inline const unsigned char* Utf8AsUnsignedChars(const Utf8Unit* aUnits) {
    227  static_assert(sizeof(Utf8Unit) == sizeof(unsigned char),
    228                "sizes must match to permissibly reinterpret_cast<>");
    229  static_assert(alignof(Utf8Unit) == alignof(unsigned char),
    230                "alignment must match to permissibly reinterpret_cast<>");
    231 
    232  // The static_asserts above only enable the reinterpret_cast<> to occur.
    233  //
    234  // Dereferencing the resulting pointer is a separate question.  Any object's
    235  // memory may be interpreted as |unsigned char| per C++11 [basic.lval]p10, but
    236  // this doesn't guarantee what values will be observed.  If |char| is
    237  // implemented to act like |unsigned char|, we're good to go: memory for the
    238  // |char| in |Utf8Unit| acts as we need.  But if |char| is implemented to act
    239  // like |signed char|, dereferencing produces the right value only if the
    240  // |char| types all use two's-complement representation.  Every modern
    241  // compiler does this, and there's a C++ proposal to standardize it.
    242  // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0907r0.html   So
    243  // *technically* this is implementation-defined -- but everyone does it and
    244  // this behavior is being standardized.
    245  return reinterpret_cast<const unsigned char*>(aUnits);
    246 }
    247 
    248 /** Returns true iff |aUnit| is an ASCII value. */
    249 constexpr bool IsAscii(Utf8Unit aUnit) {
    250  return IsAscii(aUnit.toUnsignedChar());
    251 }
    252 
    253 /**
    254 * Return true if the given span of memory consists of a valid UTF-8
    255 * string and false otherwise.
    256 *
    257 * The string *may* contain U+0000 NULL code points.
    258 */
    259 inline bool IsUtf8(mozilla::Span<const char> aString) {
    260 #if MOZ_HAS_JSRUST()
    261  size_t length = aString.Length();
    262  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
    263  // For short strings, the function call is a pessimization, and the SIMD
    264  // code won't have a chance to kick in anyway.
    265  if (length < 16) {
    266    for (size_t i = 0; i < length; i++) {
    267      if (ptr[i] >= 0x80U) {
    268        ptr += i;
    269        length -= i;
    270        goto end;
    271      }
    272    }
    273    return true;
    274  }
    275 end:
    276  return length == encoding_utf8_valid_up_to(ptr, length);
    277 #else
    278  return detail::IsValidUtf8(aString.Elements(), aString.Length());
    279 #endif
    280 }
    281 
    282 #if MOZ_HAS_JSRUST()
    283 
    284 // See Latin1.h for conversions between Latin1 and UTF-8.
    285 
    286 /**
    287 * Returns the index of the start of the first malformed byte
    288 * sequence or the length of the string if there are none.
    289 */
    290 inline size_t Utf8ValidUpTo(mozilla::Span<const char> aString) {
    291  return encoding_utf8_valid_up_to(
    292      reinterpret_cast<const uint8_t*>(aString.Elements()), aString.Length());
    293 }
    294 
    295 /**
    296 * Converts potentially-invalid UTF-16 to UTF-8 replacing lone surrogates
    297 * with the REPLACEMENT CHARACTER.
    298 *
    299 * The length of aDest must be at least the length of aSource times three.
    300 *
    301 * Returns the number of code units written.
    302 */
    303 inline size_t ConvertUtf16toUtf8(mozilla::Span<const char16_t> aSource,
    304                                 mozilla::Span<char> aDest) {
    305  return encoding_mem_convert_utf16_to_utf8(
    306      aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
    307 }
    308 
    309 /**
    310 * Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte
    311 * sequences with the REPLACEMENT CHARACTER with potentially insufficient
    312 * output space.
    313 *
    314 * Returns the number of code units read and the number of bytes written.
    315 *
    316 * If the output isn't large enough, not all input is consumed.
    317 *
    318 * The conversion is guaranteed to be complete if the length of aDest is
    319 * at least the length of aSource times three.
    320 *
    321 * The output is always valid UTF-8 ending on scalar value boundary
    322 * even in the case of partial conversion.
    323 *
    324 * The semantics of this function match the semantics of
    325 * TextEncoder.encodeInto.
    326 * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
    327 */
    328 inline std::tuple<size_t, size_t> ConvertUtf16toUtf8Partial(
    329    mozilla::Span<const char16_t> aSource, mozilla::Span<char> aDest) {
    330  size_t srcLen = aSource.Length();
    331  size_t dstLen = aDest.Length();
    332  encoding_mem_convert_utf16_to_utf8_partial(aSource.Elements(), &srcLen,
    333                                             aDest.Elements(), &dstLen);
    334  return std::make_tuple(srcLen, dstLen);
    335 }
    336 
    337 /**
    338 * Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte
    339 * sequences with the REPLACEMENT CHARACTER.
    340 *
    341 * Returns the number of code units written.
    342 *
    343 * The length of aDest must be at least one greater than the length of aSource
    344 * even though the last slot isn't written to.
    345 *
    346 * If you know that the input is valid for sure, use
    347 * UnsafeConvertValidUtf8toUtf16() instead.
    348 */
    349 inline size_t ConvertUtf8toUtf16(mozilla::Span<const char> aSource,
    350                                 mozilla::Span<char16_t> aDest) {
    351  return encoding_mem_convert_utf8_to_utf16(
    352      aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
    353 }
    354 
    355 /**
    356 * Converts known-valid UTF-8 to UTF-16. If the input might be invalid,
    357 * use ConvertUtf8toUtf16() or ConvertUtf8toUtf16WithoutReplacement() instead.
    358 *
    359 * Returns the number of code units written.
    360 *
    361 * The length of aDest must be at least the length of aSource.
    362 */
    363 inline size_t UnsafeConvertValidUtf8toUtf16(mozilla::Span<const char> aSource,
    364                                            mozilla::Span<char16_t> aDest) {
    365  return encoding_mem_convert_str_to_utf16(aSource.Elements(), aSource.Length(),
    366                                           aDest.Elements(), aDest.Length());
    367 }
    368 
    369 /**
    370 * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
    371 *
    372 * Returns the number of code units written or `mozilla::Nothing` if the
    373 * input was invalid.
    374 *
    375 * The length of the destination buffer must be at least the length of the
    376 * source buffer.
    377 *
    378 * When the input was invalid, some output may have been written.
    379 *
    380 * If you know that the input is valid for sure, use
    381 * UnsafeConvertValidUtf8toUtf16() instead.
    382 */
    383 inline mozilla::Maybe<size_t> ConvertUtf8toUtf16WithoutReplacement(
    384    mozilla::Span<const char> aSource, mozilla::Span<char16_t> aDest) {
    385  size_t written = encoding_mem_convert_utf8_to_utf16_without_replacement(
    386      aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
    387  if (MOZ_UNLIKELY(written == std::numeric_limits<size_t>::max())) {
    388    return mozilla::Nothing();
    389  }
    390  return mozilla::Some(written);
    391 }
    392 
    393 #endif  // MOZ_HAS_JSRUST
    394 
    395 /**
    396 * Returns true iff |aUnit| is a UTF-8 trailing code unit matching the pattern
    397 * 0b10xx'xxxx.
    398 */
    399 inline bool IsTrailingUnit(Utf8Unit aUnit) {
    400  return (aUnit.toUint8() & 0b1100'0000) == 0b1000'0000;
    401 }
    402 
    403 /**
    404 * Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter|
    405 * that (initially) itself points one unit past |aLeadUnit|, and
    406 * |const EndIter& aEnd| that denotes the end of the UTF-8 data when compared
    407 * against |*aIter| using |aEnd - *aIter|:
    408 *
    409 * If |aLeadUnit| and subsequent code units computed using |*aIter| (up to
    410 * |aEnd|) encode a valid code point -- not exceeding Unicode's range, not a
    411 * surrogate, in shortest form -- then return Some(that code point) and advance
    412 * |*aIter| past those code units.
    413 *
    414 * Otherwise decrement |*aIter| (so that it points at |aLeadUnit|) and return
    415 * Nothing().
    416 *
    417 * |Iter| and |EndIter| are generalized concepts most easily understood as if
    418 * they were |const char*|, |const unsigned char*|, or |const Utf8Unit*|:
    419 * iterators that when dereferenced can be used to construct a |Utf8Unit| and
    420 * that can be compared and modified in certain limited ways.  (Carefully note
    421 * that this function mutates |*aIter|.)  |Iter| and |EndIter| are template
    422 * parameters to support more-complicated adaptor iterators.
    423 *
    424 * The template parameters after |Iter| allow users to implement custom handling
    425 * for various forms of invalid UTF-8.  A version of this function that defaults
    426 * all such handling to no-ops is defined below this function.  To learn how to
    427 * define your own custom handling, consult the implementation of that function,
    428 * which documents exactly how custom handler functors are invoked.
    429 *
    430 * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
    431 * of this function without the "Inline" suffix on the name.
    432 */
    433 template <typename Iter, typename EndIter, class OnBadLeadUnit,
    434          class OnNotEnoughUnits, class OnBadTrailingUnit, class OnBadCodePoint,
    435          class OnNotShortestForm>
    436 MOZ_ALWAYS_INLINE Maybe<char32_t> DecodeOneUtf8CodePointInline(
    437    const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd,
    438    OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits,
    439    OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint,
    440    OnNotShortestForm aOnNotShortestForm) {
    441  MOZ_ASSERT(Utf8Unit((*aIter)[-1]) == aLeadUnit);
    442 
    443  char32_t n = aLeadUnit.toUint8();
    444  MOZ_ASSERT(!IsAscii(n));
    445 
    446  // |aLeadUnit| determines the number of trailing code units in the code point
    447  // and the bits of |aLeadUnit| that contribute to the code point's value.
    448  uint8_t remaining;
    449  uint32_t min;
    450  if ((n & 0b1110'0000) == 0b1100'0000) {
    451    remaining = 1;
    452    min = 0x80;
    453    n &= 0b0001'1111;
    454  } else if ((n & 0b1111'0000) == 0b1110'0000) {
    455    remaining = 2;
    456    min = 0x800;
    457    n &= 0b0000'1111;
    458  } else if ((n & 0b1111'1000) == 0b1111'0000) {
    459    remaining = 3;
    460    min = 0x10000;
    461    n &= 0b0000'0111;
    462  } else {
    463    *aIter -= 1;
    464    aOnBadLeadUnit();
    465    return Nothing();
    466  }
    467 
    468  // If the code point would require more code units than remain, the encoding
    469  // is invalid.
    470  auto actual = aEnd - *aIter;
    471  if (MOZ_UNLIKELY(actual < remaining)) {
    472    *aIter -= 1;
    473    aOnNotEnoughUnits(AssertedCast<uint8_t>(actual + 1), remaining + 1);
    474    return Nothing();
    475  }
    476 
    477  for (uint8_t i = 0; i < remaining; i++) {
    478    const Utf8Unit unit(*(*aIter)++);
    479 
    480    // Every non-leading code unit in properly encoded UTF-8 has its high
    481    // bit set and the next-highest bit unset.
    482    if (MOZ_UNLIKELY(!IsTrailingUnit(unit))) {
    483      uint8_t unitsObserved = i + 1 + 1;
    484      *aIter -= unitsObserved;
    485      aOnBadTrailingUnit(unitsObserved);
    486      return Nothing();
    487    }
    488 
    489    // The code point being encoded is the concatenation of all the
    490    // unconstrained bits.
    491    n = (n << 6) | (unit.toUint8() & 0b0011'1111);
    492  }
    493 
    494  // UTF-16 surrogates and values outside the Unicode range are invalid.
    495  if (MOZ_UNLIKELY(n > 0x10FFFF || (0xD800 <= n && n <= 0xDFFF))) {
    496    uint8_t unitsObserved = remaining + 1;
    497    *aIter -= unitsObserved;
    498    aOnBadCodePoint(n, unitsObserved);
    499    return Nothing();
    500  }
    501 
    502  // Overlong code points are also invalid.
    503  if (MOZ_UNLIKELY(n < min)) {
    504    uint8_t unitsObserved = remaining + 1;
    505    *aIter -= unitsObserved;
    506    aOnNotShortestForm(n, unitsObserved);
    507    return Nothing();
    508  }
    509 
    510  return Some(n);
    511 }
    512 
    513 /**
    514 * Identical to the above function, but not forced to be instantiated inline --
    515 * the compiler is permitted to common up separate invocations if it chooses.
    516 */
    517 template <typename Iter, typename EndIter, class OnBadLeadUnit,
    518          class OnNotEnoughUnits, class OnBadTrailingUnit, class OnBadCodePoint,
    519          class OnNotShortestForm>
    520 inline Maybe<char32_t> DecodeOneUtf8CodePoint(
    521    const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd,
    522    OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits,
    523    OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint,
    524    OnNotShortestForm aOnNotShortestForm) {
    525  return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, aOnBadLeadUnit,
    526                                      aOnNotEnoughUnits, aOnBadTrailingUnit,
    527                                      aOnBadCodePoint, aOnNotShortestForm);
    528 }
    529 
    530 /**
    531 * Like the always-inlined function above, but with no-op behavior from all
    532 * trailing if-invalid notifier functors.
    533 *
    534 * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
    535 * of this function without the "Inline" suffix on the name.
    536 */
    537 template <typename Iter, typename EndIter>
    538 MOZ_ALWAYS_INLINE Maybe<char32_t> DecodeOneUtf8CodePointInline(
    539    const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd) {
    540  // aOnBadLeadUnit is called when |aLeadUnit| itself is an invalid lead unit in
    541  // a multi-unit code point.  It is passed no arguments: the caller already has
    542  // |aLeadUnit| on hand, so no need to provide it again.
    543  auto onBadLeadUnit = []() {};
    544 
    545  // aOnNotEnoughUnits is called when |aLeadUnit| properly indicates a code
    546  // point length, but there aren't enough units from |*aIter| to |aEnd| to
    547  // satisfy that length.  It is passed the number of code units actually
    548  // available (according to |aEnd - *aIter|) and the number of code units that
    549  // |aLeadUnit| indicates are needed.  Both numbers include the contribution
    550  // of |aLeadUnit| itself: so |aUnitsAvailable <= 3|, |aUnitsNeeded <= 4|, and
    551  // |aUnitsAvailable < aUnitsNeeded|.  As above, it also is not passed the lead
    552  // code unit.
    553  auto onNotEnoughUnits = [](uint8_t aUnitsAvailable, uint8_t aUnitsNeeded) {};
    554 
    555  // aOnBadTrailingUnit is called when one of the trailing code units implied by
    556  // |aLeadUnit| doesn't match the 0b10xx'xxxx bit pattern that all UTF-8
    557  // trailing code units must satisfy.  It is passed the total count of units
    558  // observed (including |aLeadUnit|).  The bad trailing code unit will
    559  // conceptually be at |(*aIter)[aUnitsObserved - 1]| if this functor is
    560  // called, and so |aUnitsObserved <= 4|.
    561  auto onBadTrailingUnit = [](uint8_t aUnitsObserved) {};
    562 
    563  // aOnBadCodePoint is called when a structurally-correct code point encoding
    564  // is found, but the *value* that is encoded is not a valid code point: either
    565  // because it exceeded the U+10FFFF Unicode maximum code point, or because it
    566  // was a UTF-16 surrogate.  It is passed the non-code point value and the
    567  // number of code units used to encode it.
    568  auto onBadCodePoint = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {};
    569 
    570  // aOnNotShortestForm is called when structurally-correct encoding is found,
    571  // but the encoded value should have been encoded in fewer code units (e.g.
    572  // mis-encoding U+0000 as 0b1100'0000 0b1000'0000 in two code units instead of
    573  // as 0b0000'0000).  It is passed the mis-encoded code point (which will be
    574  // valid and not a surrogate) and the count of code units that mis-encoded it.
    575  auto onNotShortestForm = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {
    576  };
    577 
    578  return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, onBadLeadUnit,
    579                                      onNotEnoughUnits, onBadTrailingUnit,
    580                                      onBadCodePoint, onNotShortestForm);
    581 }
    582 
    583 /**
    584 * Identical to the above function, but not forced to be instantiated inline --
    585 * the compiler/linker are allowed to common up separate invocations.
    586 */
    587 template <typename Iter, typename EndIter>
    588 inline Maybe<char32_t> DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit,
    589                                              Iter* aIter,
    590                                              const EndIter& aEnd) {
    591  return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd);
    592 }
    593 
    594 }  // namespace mozilla
    595 
    596 #endif /* mozilla_Utf8_h */
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE