Utf8.h (25263B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 /* 8 * UTF-8-related functionality, including a type-safe structure representing a 9 * UTF-8 code unit. 10 */ 11 12 #ifndef mozilla_Utf8_h 13 #define mozilla_Utf8_h 14 15 #include "mozilla/Casting.h" // for mozilla::AssertedCast 16 #include "mozilla/Likely.h" // for MOZ_UNLIKELY 17 #include "mozilla/Maybe.h" // for mozilla::Maybe 18 #include "mozilla/Span.h" // for mozilla::Span 19 #include "mozilla/TextUtils.h" // for mozilla::IsAscii and via Latin1.h for 20 // encoding_rs_mem.h and MOZ_HAS_JSRUST. 21 #include "mozilla/Types.h" // for MFBT_API 22 23 #include <limits> // for std::numeric_limits 24 #include <limits.h> // for CHAR_BIT 25 #include <stddef.h> // for size_t 26 #include <stdint.h> // for uint8_t 27 28 #if MOZ_HAS_JSRUST() 29 // Can't include mozilla/Encoding.h here. 30 extern "C" { 31 // Declared as uint8_t instead of char to match declaration in another header. 32 size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len); 33 } 34 #else 35 namespace mozilla { 36 namespace detail { 37 extern MFBT_API bool IsValidUtf8(const void* aCodeUnits, size_t aCount); 38 }; // namespace detail 39 }; // namespace mozilla 40 #endif // MOZ_HAS_JSRUST 41 42 namespace mozilla { 43 44 union Utf8Unit; 45 46 static_assert(CHAR_BIT == 8, 47 "Utf8Unit won't work so well with non-octet chars"); 48 49 /** 50 * A code unit within a UTF-8 encoded string. (A code unit is the smallest 51 * unit within the Unicode encoding of a string. For UTF-8 this is an 8-bit 52 * number; for UTF-16 it would be a 16-bit number.) 53 * 54 * This is *not* the same as a single code point: in UTF-8, non-ASCII code 55 * points are constituted by multiple code units. 56 */ 57 union Utf8Unit { 58 private: 59 // Utf8Unit is a union wrapping a raw |char|. The C++ object model and C++ 60 // requirements as to how objects may be accessed with respect to their actual 61 // types (almost?) uniquely compel this choice. 62 // 63 // Our requirements for a UTF-8 code unit representation are: 64 // 65 // 1. It must be "compatible" with C++ character/string literals that use 66 // the UTF-8 encoding. Given a properly encoded C++ literal, you should 67 // be able to use |Utf8Unit| and friends to access it; given |Utf8Unit| 68 // and friends (particularly UnicodeData), you should be able to access 69 // C++ character types for their contents. 70 // 2. |Utf8Unit| and friends must convert to/from |char| and |char*| only by 71 // explicit operation. 72 // 3. |Utf8Unit| must participate in overload resolution and template type 73 // equivalence (that is, given |template<class> class X|, when |X<T>| and 74 // |X<U>| are the same type) distinctly from the C++ character types. 75 // 76 // And a few nice-to-haves (at least for the moment): 77 // 78 // 4. The representation should use unsigned numbers, to avoid undefined 79 // behavior that can arise with signed types, and because Unicode code 80 // points and code units are unsigned. 81 // 5. |Utf8Unit| and friends should be convertible to/from |unsigned char| 82 // and |unsigned char*|, for APIs that (because of #4 above) use those 83 // types as the "natural" choice for UTF-8 data. 84 // 85 // #1 requires that |Utf8Unit| "incorporate" a C++ character type: one of 86 // |{,{un,}signed} char|.[0] |uint8_t| won't work because it might not be a 87 // C++ character type. 88 // 89 // #2 and #3 mean that |Utf8Unit| can't *be* such a type (or a typedef to one: 90 // typedefs don't generate *new* types, just type aliases). This requires a 91 // compound type. 92 // 93 // The ultimate representation (and character type in it) is constrained by 94 // C++14 [basic.lval]p10 that defines how objects may be accessed, with 95 // respect to the dynamic type in memory and the actual type used to access 96 // them. It reads: 97 // 98 // If a program attempts to access the stored value of an object 99 // through a glvalue of other than one of the following types the 100 // behavior is undefined: 101 // 102 // 1. the dynamic type of the object, 103 // 2. a cv-qualified version of the dynamic type of the object, 104 // ...other types irrelevant here... 105 // 3. an aggregate or union type that includes one of the 106 // aforementioned types among its elements or non-static data 107 // members (including, recursively, an element or non-static 108 // data member of a subaggregate or contained union), 109 // ...more irrelevant types... 110 // 4. a char or unsigned char type. 111 // 112 // Accessing (wrapped) UTF-8 data as |char|/|unsigned char| is allowed no 113 // matter the representation by #4. (Briefly set aside what values are seen.) 114 // (And #2 allows |const| on either the dynamic type or the accessing type.) 115 // (|signed char| is really only useful for small signed numbers, not 116 // characters, so we ignore it.) 117 // 118 // If we interpret contents as |char|/|unsigned char| contrary to the actual 119 // type stored there, what happens? C++14 [basic.fundamental]p1 requires 120 // character types be identically aligned/sized; C++14 [basic.fundamental]p3 121 // requires |signed char| and |unsigned char| have the same value 122 // representation. C++ doesn't require identical bitwise representation, tho. 123 // Practically we could assume it, but this verges on C++ spec bits best not 124 // *relied* on for correctness, if possible. 125 // 126 // So we don't expose |Utf8Unit|'s contents as |unsigned char*|: only |char| 127 // and |char*|. Instead we safely expose |unsigned char| by fully-defined 128 // *integral conversion* (C++14 [conv.integral]p2). Integral conversion from 129 // |unsigned char| → |char| has only implementation-defined behavior. It'd be 130 // better not to depend on that, but given twos-complement won, it should be 131 // okay. (Also |unsigned char*| is awkward enough to work with for strings 132 // that it probably doesn't appear in string manipulation much anyway, only in 133 // places that should really use |Utf8Unit| directly.) 134 // 135 // The opposite direction -- interpreting |char| or |char*| data through 136 // |Utf8Unit| -- isn't tricky as long as |Utf8Unit| contains a |char| as 137 // decided above, using #3. An "aggregate or union" will work that contains a 138 // |char|. Oddly, an aggregate won't work: C++14 [dcl.init.aggr]p1 says 139 // aggregates must have "no private or protected non-static data members", and 140 // we want to keep the inner |char| hidden. So a |struct| is out, and only 141 // |union| remains. 142 // 143 // (Enums are not "an aggregate or union type", so [maybe surprisingly] we 144 // can't make |Utf8Unit| an enum class with |char| underlying type, because we 145 // are given no license to treat |char| memory as such an |enum|'s memory.) 146 // 147 // Therefore |Utf8Unit| is a union type with a |char| non-static data member. 148 // This satisfies all our requirements. It also supports the nice-to-haves of 149 // creating a |Utf8Unit| from an |unsigned char|, and being convertible to 150 // |unsigned char|. It doesn't satisfy the nice-to-haves of using an 151 // |unsigned char| internally, nor of letting us wrap an existing 152 // |unsigned char| or pointer to one. We probably *could* do these, if we 153 // were willing to rely harder on implementation-defined behaviors, but for 154 // now we privilege C++'s main character type over some conceptual purity. 155 // 156 // 0. There's a proposal for a UTF-8 character type distinct from the existing 157 // C++ narrow character types: 158 // 159 // http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0482r0.html 160 // 161 // but it hasn't been standardized (and might never be), and none of the 162 // compilers we really care about have implemented it. Maybe someday we 163 // can change our implementation to it without too much trouble, if we're 164 // lucky... 165 char mValue = '\0'; 166 167 public: 168 Utf8Unit() = default; 169 170 explicit constexpr Utf8Unit(char aUnit) : mValue(aUnit) {} 171 172 explicit constexpr Utf8Unit(unsigned char aUnit) 173 : mValue(static_cast<char>(aUnit)) { 174 // Per the above comment, the prior cast is integral conversion with 175 // implementation-defined semantics, and we regretfully but unavoidably 176 // assume the conversion does what we want it to. 177 } 178 179 #ifdef __cpp_char8_t 180 explicit constexpr Utf8Unit(char8_t aUnit) 181 : mValue(static_cast<char>(aUnit)) {} 182 #endif 183 184 constexpr bool operator==(const Utf8Unit& aOther) const { 185 return mValue == aOther.mValue; 186 } 187 188 constexpr bool operator!=(const Utf8Unit& aOther) const { 189 return !(*this == aOther); 190 } 191 192 /** Convert a UTF-8 code unit to a raw char. */ 193 constexpr char toChar() const { 194 // Only a |char| is ever permitted to be written into this location, so this 195 // is both permissible and returns the desired value. 196 return mValue; 197 } 198 199 /** Convert a UTF-8 code unit to a raw unsigned char. */ 200 constexpr unsigned char toUnsignedChar() const { 201 // Per the above comment, this is well-defined integral conversion. 202 return static_cast<unsigned char>(mValue); 203 } 204 205 /** Convert a UTF-8 code unit to a uint8_t. */ 206 constexpr uint8_t toUint8() const { 207 // Per the above comment, this is well-defined integral conversion. 208 return static_cast<uint8_t>(mValue); 209 } 210 211 // We currently don't expose |&mValue|. |UnicodeData| sort of does, but 212 // that's a somewhat separate concern, justified in different comments in 213 // that other code. 214 }; 215 216 /** 217 * Reinterpret the address of a UTF-8 code unit as |const unsigned char*|. 218 * 219 * Assuming proper backing has been set up, the resulting |const unsigned char*| 220 * may validly be dereferenced. 221 * 222 * No access is provided to mutate this underlying memory as |unsigned char|. 223 * Presently memory inside |Utf8Unit| is *only* stored as |char|, and we are 224 * loath to offer a way to write non-|char| data until absolutely necessary. 225 */ 226 inline const unsigned char* Utf8AsUnsignedChars(const Utf8Unit* aUnits) { 227 static_assert(sizeof(Utf8Unit) == sizeof(unsigned char), 228 "sizes must match to permissibly reinterpret_cast<>"); 229 static_assert(alignof(Utf8Unit) == alignof(unsigned char), 230 "alignment must match to permissibly reinterpret_cast<>"); 231 232 // The static_asserts above only enable the reinterpret_cast<> to occur. 233 // 234 // Dereferencing the resulting pointer is a separate question. Any object's 235 // memory may be interpreted as |unsigned char| per C++11 [basic.lval]p10, but 236 // this doesn't guarantee what values will be observed. If |char| is 237 // implemented to act like |unsigned char|, we're good to go: memory for the 238 // |char| in |Utf8Unit| acts as we need. But if |char| is implemented to act 239 // like |signed char|, dereferencing produces the right value only if the 240 // |char| types all use two's-complement representation. Every modern 241 // compiler does this, and there's a C++ proposal to standardize it. 242 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0907r0.html So 243 // *technically* this is implementation-defined -- but everyone does it and 244 // this behavior is being standardized. 245 return reinterpret_cast<const unsigned char*>(aUnits); 246 } 247 248 /** Returns true iff |aUnit| is an ASCII value. */ 249 constexpr bool IsAscii(Utf8Unit aUnit) { 250 return IsAscii(aUnit.toUnsignedChar()); 251 } 252 253 /** 254 * Return true if the given span of memory consists of a valid UTF-8 255 * string and false otherwise. 256 * 257 * The string *may* contain U+0000 NULL code points. 258 */ 259 inline bool IsUtf8(mozilla::Span<const char> aString) { 260 #if MOZ_HAS_JSRUST() 261 size_t length = aString.Length(); 262 const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements()); 263 // For short strings, the function call is a pessimization, and the SIMD 264 // code won't have a chance to kick in anyway. 265 if (length < 16) { 266 for (size_t i = 0; i < length; i++) { 267 if (ptr[i] >= 0x80U) { 268 ptr += i; 269 length -= i; 270 goto end; 271 } 272 } 273 return true; 274 } 275 end: 276 return length == encoding_utf8_valid_up_to(ptr, length); 277 #else 278 return detail::IsValidUtf8(aString.Elements(), aString.Length()); 279 #endif 280 } 281 282 #if MOZ_HAS_JSRUST() 283 284 // See Latin1.h for conversions between Latin1 and UTF-8. 285 286 /** 287 * Returns the index of the start of the first malformed byte 288 * sequence or the length of the string if there are none. 289 */ 290 inline size_t Utf8ValidUpTo(mozilla::Span<const char> aString) { 291 return encoding_utf8_valid_up_to( 292 reinterpret_cast<const uint8_t*>(aString.Elements()), aString.Length()); 293 } 294 295 /** 296 * Converts potentially-invalid UTF-16 to UTF-8 replacing lone surrogates 297 * with the REPLACEMENT CHARACTER. 298 * 299 * The length of aDest must be at least the length of aSource times three. 300 * 301 * Returns the number of code units written. 302 */ 303 inline size_t ConvertUtf16toUtf8(mozilla::Span<const char16_t> aSource, 304 mozilla::Span<char> aDest) { 305 return encoding_mem_convert_utf16_to_utf8( 306 aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); 307 } 308 309 /** 310 * Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte 311 * sequences with the REPLACEMENT CHARACTER with potentially insufficient 312 * output space. 313 * 314 * Returns the number of code units read and the number of bytes written. 315 * 316 * If the output isn't large enough, not all input is consumed. 317 * 318 * The conversion is guaranteed to be complete if the length of aDest is 319 * at least the length of aSource times three. 320 * 321 * The output is always valid UTF-8 ending on scalar value boundary 322 * even in the case of partial conversion. 323 * 324 * The semantics of this function match the semantics of 325 * TextEncoder.encodeInto. 326 * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto 327 */ 328 inline std::tuple<size_t, size_t> ConvertUtf16toUtf8Partial( 329 mozilla::Span<const char16_t> aSource, mozilla::Span<char> aDest) { 330 size_t srcLen = aSource.Length(); 331 size_t dstLen = aDest.Length(); 332 encoding_mem_convert_utf16_to_utf8_partial(aSource.Elements(), &srcLen, 333 aDest.Elements(), &dstLen); 334 return std::make_tuple(srcLen, dstLen); 335 } 336 337 /** 338 * Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte 339 * sequences with the REPLACEMENT CHARACTER. 340 * 341 * Returns the number of code units written. 342 * 343 * The length of aDest must be at least one greater than the length of aSource 344 * even though the last slot isn't written to. 345 * 346 * If you know that the input is valid for sure, use 347 * UnsafeConvertValidUtf8toUtf16() instead. 348 */ 349 inline size_t ConvertUtf8toUtf16(mozilla::Span<const char> aSource, 350 mozilla::Span<char16_t> aDest) { 351 return encoding_mem_convert_utf8_to_utf16( 352 aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); 353 } 354 355 /** 356 * Converts known-valid UTF-8 to UTF-16. If the input might be invalid, 357 * use ConvertUtf8toUtf16() or ConvertUtf8toUtf16WithoutReplacement() instead. 358 * 359 * Returns the number of code units written. 360 * 361 * The length of aDest must be at least the length of aSource. 362 */ 363 inline size_t UnsafeConvertValidUtf8toUtf16(mozilla::Span<const char> aSource, 364 mozilla::Span<char16_t> aDest) { 365 return encoding_mem_convert_str_to_utf16(aSource.Elements(), aSource.Length(), 366 aDest.Elements(), aDest.Length()); 367 } 368 369 /** 370 * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. 371 * 372 * Returns the number of code units written or `mozilla::Nothing` if the 373 * input was invalid. 374 * 375 * The length of the destination buffer must be at least the length of the 376 * source buffer. 377 * 378 * When the input was invalid, some output may have been written. 379 * 380 * If you know that the input is valid for sure, use 381 * UnsafeConvertValidUtf8toUtf16() instead. 382 */ 383 inline mozilla::Maybe<size_t> ConvertUtf8toUtf16WithoutReplacement( 384 mozilla::Span<const char> aSource, mozilla::Span<char16_t> aDest) { 385 size_t written = encoding_mem_convert_utf8_to_utf16_without_replacement( 386 aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); 387 if (MOZ_UNLIKELY(written == std::numeric_limits<size_t>::max())) { 388 return mozilla::Nothing(); 389 } 390 return mozilla::Some(written); 391 } 392 393 #endif // MOZ_HAS_JSRUST 394 395 /** 396 * Returns true iff |aUnit| is a UTF-8 trailing code unit matching the pattern 397 * 0b10xx'xxxx. 398 */ 399 inline bool IsTrailingUnit(Utf8Unit aUnit) { 400 return (aUnit.toUint8() & 0b1100'0000) == 0b1000'0000; 401 } 402 403 /** 404 * Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter| 405 * that (initially) itself points one unit past |aLeadUnit|, and 406 * |const EndIter& aEnd| that denotes the end of the UTF-8 data when compared 407 * against |*aIter| using |aEnd - *aIter|: 408 * 409 * If |aLeadUnit| and subsequent code units computed using |*aIter| (up to 410 * |aEnd|) encode a valid code point -- not exceeding Unicode's range, not a 411 * surrogate, in shortest form -- then return Some(that code point) and advance 412 * |*aIter| past those code units. 413 * 414 * Otherwise decrement |*aIter| (so that it points at |aLeadUnit|) and return 415 * Nothing(). 416 * 417 * |Iter| and |EndIter| are generalized concepts most easily understood as if 418 * they were |const char*|, |const unsigned char*|, or |const Utf8Unit*|: 419 * iterators that when dereferenced can be used to construct a |Utf8Unit| and 420 * that can be compared and modified in certain limited ways. (Carefully note 421 * that this function mutates |*aIter|.) |Iter| and |EndIter| are template 422 * parameters to support more-complicated adaptor iterators. 423 * 424 * The template parameters after |Iter| allow users to implement custom handling 425 * for various forms of invalid UTF-8. A version of this function that defaults 426 * all such handling to no-ops is defined below this function. To learn how to 427 * define your own custom handling, consult the implementation of that function, 428 * which documents exactly how custom handler functors are invoked. 429 * 430 * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version 431 * of this function without the "Inline" suffix on the name. 432 */ 433 template <typename Iter, typename EndIter, class OnBadLeadUnit, 434 class OnNotEnoughUnits, class OnBadTrailingUnit, class OnBadCodePoint, 435 class OnNotShortestForm> 436 MOZ_ALWAYS_INLINE Maybe<char32_t> DecodeOneUtf8CodePointInline( 437 const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd, 438 OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits, 439 OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint, 440 OnNotShortestForm aOnNotShortestForm) { 441 MOZ_ASSERT(Utf8Unit((*aIter)[-1]) == aLeadUnit); 442 443 char32_t n = aLeadUnit.toUint8(); 444 MOZ_ASSERT(!IsAscii(n)); 445 446 // |aLeadUnit| determines the number of trailing code units in the code point 447 // and the bits of |aLeadUnit| that contribute to the code point's value. 448 uint8_t remaining; 449 uint32_t min; 450 if ((n & 0b1110'0000) == 0b1100'0000) { 451 remaining = 1; 452 min = 0x80; 453 n &= 0b0001'1111; 454 } else if ((n & 0b1111'0000) == 0b1110'0000) { 455 remaining = 2; 456 min = 0x800; 457 n &= 0b0000'1111; 458 } else if ((n & 0b1111'1000) == 0b1111'0000) { 459 remaining = 3; 460 min = 0x10000; 461 n &= 0b0000'0111; 462 } else { 463 *aIter -= 1; 464 aOnBadLeadUnit(); 465 return Nothing(); 466 } 467 468 // If the code point would require more code units than remain, the encoding 469 // is invalid. 470 auto actual = aEnd - *aIter; 471 if (MOZ_UNLIKELY(actual < remaining)) { 472 *aIter -= 1; 473 aOnNotEnoughUnits(AssertedCast<uint8_t>(actual + 1), remaining + 1); 474 return Nothing(); 475 } 476 477 for (uint8_t i = 0; i < remaining; i++) { 478 const Utf8Unit unit(*(*aIter)++); 479 480 // Every non-leading code unit in properly encoded UTF-8 has its high 481 // bit set and the next-highest bit unset. 482 if (MOZ_UNLIKELY(!IsTrailingUnit(unit))) { 483 uint8_t unitsObserved = i + 1 + 1; 484 *aIter -= unitsObserved; 485 aOnBadTrailingUnit(unitsObserved); 486 return Nothing(); 487 } 488 489 // The code point being encoded is the concatenation of all the 490 // unconstrained bits. 491 n = (n << 6) | (unit.toUint8() & 0b0011'1111); 492 } 493 494 // UTF-16 surrogates and values outside the Unicode range are invalid. 495 if (MOZ_UNLIKELY(n > 0x10FFFF || (0xD800 <= n && n <= 0xDFFF))) { 496 uint8_t unitsObserved = remaining + 1; 497 *aIter -= unitsObserved; 498 aOnBadCodePoint(n, unitsObserved); 499 return Nothing(); 500 } 501 502 // Overlong code points are also invalid. 503 if (MOZ_UNLIKELY(n < min)) { 504 uint8_t unitsObserved = remaining + 1; 505 *aIter -= unitsObserved; 506 aOnNotShortestForm(n, unitsObserved); 507 return Nothing(); 508 } 509 510 return Some(n); 511 } 512 513 /** 514 * Identical to the above function, but not forced to be instantiated inline -- 515 * the compiler is permitted to common up separate invocations if it chooses. 516 */ 517 template <typename Iter, typename EndIter, class OnBadLeadUnit, 518 class OnNotEnoughUnits, class OnBadTrailingUnit, class OnBadCodePoint, 519 class OnNotShortestForm> 520 inline Maybe<char32_t> DecodeOneUtf8CodePoint( 521 const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd, 522 OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits, 523 OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint, 524 OnNotShortestForm aOnNotShortestForm) { 525 return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, aOnBadLeadUnit, 526 aOnNotEnoughUnits, aOnBadTrailingUnit, 527 aOnBadCodePoint, aOnNotShortestForm); 528 } 529 530 /** 531 * Like the always-inlined function above, but with no-op behavior from all 532 * trailing if-invalid notifier functors. 533 * 534 * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version 535 * of this function without the "Inline" suffix on the name. 536 */ 537 template <typename Iter, typename EndIter> 538 MOZ_ALWAYS_INLINE Maybe<char32_t> DecodeOneUtf8CodePointInline( 539 const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd) { 540 // aOnBadLeadUnit is called when |aLeadUnit| itself is an invalid lead unit in 541 // a multi-unit code point. It is passed no arguments: the caller already has 542 // |aLeadUnit| on hand, so no need to provide it again. 543 auto onBadLeadUnit = []() {}; 544 545 // aOnNotEnoughUnits is called when |aLeadUnit| properly indicates a code 546 // point length, but there aren't enough units from |*aIter| to |aEnd| to 547 // satisfy that length. It is passed the number of code units actually 548 // available (according to |aEnd - *aIter|) and the number of code units that 549 // |aLeadUnit| indicates are needed. Both numbers include the contribution 550 // of |aLeadUnit| itself: so |aUnitsAvailable <= 3|, |aUnitsNeeded <= 4|, and 551 // |aUnitsAvailable < aUnitsNeeded|. As above, it also is not passed the lead 552 // code unit. 553 auto onNotEnoughUnits = [](uint8_t aUnitsAvailable, uint8_t aUnitsNeeded) {}; 554 555 // aOnBadTrailingUnit is called when one of the trailing code units implied by 556 // |aLeadUnit| doesn't match the 0b10xx'xxxx bit pattern that all UTF-8 557 // trailing code units must satisfy. It is passed the total count of units 558 // observed (including |aLeadUnit|). The bad trailing code unit will 559 // conceptually be at |(*aIter)[aUnitsObserved - 1]| if this functor is 560 // called, and so |aUnitsObserved <= 4|. 561 auto onBadTrailingUnit = [](uint8_t aUnitsObserved) {}; 562 563 // aOnBadCodePoint is called when a structurally-correct code point encoding 564 // is found, but the *value* that is encoded is not a valid code point: either 565 // because it exceeded the U+10FFFF Unicode maximum code point, or because it 566 // was a UTF-16 surrogate. It is passed the non-code point value and the 567 // number of code units used to encode it. 568 auto onBadCodePoint = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {}; 569 570 // aOnNotShortestForm is called when structurally-correct encoding is found, 571 // but the encoded value should have been encoded in fewer code units (e.g. 572 // mis-encoding U+0000 as 0b1100'0000 0b1000'0000 in two code units instead of 573 // as 0b0000'0000). It is passed the mis-encoded code point (which will be 574 // valid and not a surrogate) and the count of code units that mis-encoded it. 575 auto onNotShortestForm = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) { 576 }; 577 578 return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, onBadLeadUnit, 579 onNotEnoughUnits, onBadTrailingUnit, 580 onBadCodePoint, onNotShortestForm); 581 } 582 583 /** 584 * Identical to the above function, but not forced to be instantiated inline -- 585 * the compiler/linker are allowed to common up separate invocations. 586 */ 587 template <typename Iter, typename EndIter> 588 inline Maybe<char32_t> DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit, 589 Iter* aIter, 590 const EndIter& aEnd) { 591 return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd); 592 } 593 594 } // namespace mozilla 595 596 #endif /* mozilla_Utf8_h */