TestUtf8.cpp (26076B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #define MOZ_PRETEND_NO_JSRUST 1 8 9 #include "mozilla/Utf8.h" 10 11 #include "mozilla/Assertions.h" 12 #include "mozilla/EnumSet.h" 13 #include "mozilla/IntegerRange.h" 14 #include "mozilla/Span.h" 15 16 using mozilla::AsChars; 17 using mozilla::DecodeOneUtf8CodePoint; 18 using mozilla::EnumSet; 19 using mozilla::IntegerRange; 20 using mozilla::IsAscii; 21 using mozilla::IsUtf8; 22 using mozilla::Span; 23 using mozilla::Utf8Unit; 24 25 static void TestUtf8Unit() { 26 Utf8Unit c('A'); 27 MOZ_RELEASE_ASSERT(c.toChar() == 'A'); 28 MOZ_RELEASE_ASSERT(c == Utf8Unit('A')); 29 MOZ_RELEASE_ASSERT(c != Utf8Unit('B')); 30 MOZ_RELEASE_ASSERT(c.toUint8() == 0x41); 31 32 unsigned char asUnsigned = 'A'; 33 MOZ_RELEASE_ASSERT(c.toUnsignedChar() == asUnsigned); 34 MOZ_RELEASE_ASSERT(Utf8Unit('B').toUnsignedChar() != asUnsigned); 35 36 Utf8Unit first('@'); 37 Utf8Unit second('#'); 38 39 MOZ_RELEASE_ASSERT(first != second); 40 41 first = second; 42 MOZ_RELEASE_ASSERT(first == second); 43 } 44 45 template <typename Char> 46 struct ToUtf8Units { 47 public: 48 explicit ToUtf8Units(const Char* aStart, const Char* aEnd) 49 : lead(Utf8Unit(aStart[0])), iter(aStart + 1), end(aEnd) { 50 MOZ_RELEASE_ASSERT(!IsAscii(aStart[0])); 51 } 52 53 const Utf8Unit lead; 54 const Char* iter; 55 const Char* const end; 56 }; 57 58 class AssertIfCalled { 59 public: 60 template <typename... Args> 61 void operator()(Args&&... aArgs) { 62 MOZ_RELEASE_ASSERT(false, "AssertIfCalled instance was called"); 63 } 64 }; 65 66 // NOTE: For simplicity in treating |aCharN| identically regardless whether it's 67 // a string literal or a more-generalized array, we require |aCharN| be 68 // null-terminated. 69 70 template <typename Char, size_t N> 71 static void ExpectValidCodePoint(const Char (&aCharN)[N], 72 char32_t aExpectedCodePoint) { 73 MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0, 74 "array must be null-terminated for |aCharN + N - 1| to " 75 "compute the value of |aIter| as altered by " 76 "DecodeOneUtf8CodePoint"); 77 78 ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1); 79 auto simple = 80 DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end); 81 MOZ_RELEASE_ASSERT(simple.isSome()); 82 MOZ_RELEASE_ASSERT(*simple == aExpectedCodePoint); 83 MOZ_RELEASE_ASSERT(simpleUnit.iter == simpleUnit.end); 84 85 ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1); 86 auto complex = DecodeOneUtf8CodePoint( 87 complexUnit.lead, &complexUnit.iter, complexUnit.end, AssertIfCalled(), 88 AssertIfCalled(), AssertIfCalled(), AssertIfCalled(), AssertIfCalled()); 89 MOZ_RELEASE_ASSERT(complex.isSome()); 90 MOZ_RELEASE_ASSERT(*complex == aExpectedCodePoint); 91 MOZ_RELEASE_ASSERT(complexUnit.iter == complexUnit.end); 92 } 93 94 enum class InvalidUtf8Reason { 95 BadLeadUnit, 96 NotEnoughUnits, 97 BadTrailingUnit, 98 BadCodePoint, 99 NotShortestForm, 100 }; 101 102 template <typename Char, size_t N> 103 static void ExpectInvalidCodePointHelper(const Char (&aCharN)[N], 104 InvalidUtf8Reason aExpectedReason, 105 uint8_t aExpectedUnitsAvailable, 106 uint8_t aExpectedUnitsNeeded, 107 char32_t aExpectedBadCodePoint, 108 uint8_t aExpectedUnitsObserved) { 109 MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0, 110 "array must be null-terminated for |aCharN + N - 1| to " 111 "compute the value of |aIter| as altered by " 112 "DecodeOneUtf8CodePoint"); 113 114 ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1); 115 auto simple = 116 DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end); 117 MOZ_RELEASE_ASSERT(simple.isNothing()); 118 MOZ_RELEASE_ASSERT(static_cast<const void*>(simpleUnit.iter) == aCharN); 119 120 EnumSet<InvalidUtf8Reason> reasons; 121 uint8_t unitsAvailable; 122 uint8_t unitsNeeded; 123 char32_t badCodePoint; 124 uint8_t unitsObserved; 125 126 struct OnNotShortestForm { 127 EnumSet<InvalidUtf8Reason>& reasons; 128 char32_t& badCodePoint; 129 uint8_t& unitsObserved; 130 131 void operator()(char32_t aBadCodePoint, uint8_t aUnitsObserved) { 132 reasons += InvalidUtf8Reason::NotShortestForm; 133 badCodePoint = aBadCodePoint; 134 unitsObserved = aUnitsObserved; 135 } 136 }; 137 138 ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1); 139 auto complex = DecodeOneUtf8CodePoint( 140 complexUnit.lead, &complexUnit.iter, complexUnit.end, 141 [&reasons]() { reasons += InvalidUtf8Reason::BadLeadUnit; }, 142 [&reasons, &unitsAvailable, &unitsNeeded](uint8_t aUnitsAvailable, 143 uint8_t aUnitsNeeded) { 144 reasons += InvalidUtf8Reason::NotEnoughUnits; 145 unitsAvailable = aUnitsAvailable; 146 unitsNeeded = aUnitsNeeded; 147 }, 148 [&reasons, &unitsObserved](uint8_t aUnitsObserved) { 149 reasons += InvalidUtf8Reason::BadTrailingUnit; 150 unitsObserved = aUnitsObserved; 151 }, 152 [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint, 153 uint8_t aUnitsObserved) { 154 reasons += InvalidUtf8Reason::BadCodePoint; 155 badCodePoint = aBadCodePoint; 156 unitsObserved = aUnitsObserved; 157 }, 158 [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint, 159 uint8_t aUnitsObserved) { 160 reasons += InvalidUtf8Reason::NotShortestForm; 161 badCodePoint = aBadCodePoint; 162 unitsObserved = aUnitsObserved; 163 }); 164 MOZ_RELEASE_ASSERT(complex.isNothing()); 165 MOZ_RELEASE_ASSERT(static_cast<const void*>(complexUnit.iter) == aCharN); 166 167 bool alreadyIterated = false; 168 for (InvalidUtf8Reason reason : reasons) { 169 MOZ_RELEASE_ASSERT(!alreadyIterated); 170 alreadyIterated = true; 171 172 switch (reason) { 173 case InvalidUtf8Reason::BadLeadUnit: 174 break; 175 176 case InvalidUtf8Reason::NotEnoughUnits: 177 MOZ_RELEASE_ASSERT(unitsAvailable == aExpectedUnitsAvailable); 178 MOZ_RELEASE_ASSERT(unitsNeeded == aExpectedUnitsNeeded); 179 break; 180 181 case InvalidUtf8Reason::BadTrailingUnit: 182 MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved); 183 break; 184 185 case InvalidUtf8Reason::BadCodePoint: 186 MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint); 187 MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved); 188 break; 189 190 case InvalidUtf8Reason::NotShortestForm: 191 MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint); 192 MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved); 193 break; 194 } 195 } 196 } 197 198 // NOTE: For simplicity in treating |aCharN| identically regardless whether it's 199 // a string literal or a more-generalized array, we require |aCharN| be 200 // null-terminated in all these functions. 201 202 template <typename Char, size_t N> 203 static void ExpectBadLeadUnit(const Char (&aCharN)[N]) { 204 ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadLeadUnit, 0xFF, 205 0xFF, 0xFFFFFFFF, 0xFF); 206 } 207 208 template <typename Char, size_t N> 209 static void ExpectNotEnoughUnits(const Char (&aCharN)[N], 210 uint8_t aExpectedUnitsAvailable, 211 uint8_t aExpectedUnitsNeeded) { 212 ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotEnoughUnits, 213 aExpectedUnitsAvailable, aExpectedUnitsNeeded, 214 0xFFFFFFFF, 0xFF); 215 } 216 217 template <typename Char, size_t N> 218 static void ExpectBadTrailingUnit(const Char (&aCharN)[N], 219 uint8_t aExpectedUnitsObserved) { 220 ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadTrailingUnit, 0xFF, 221 0xFF, 0xFFFFFFFF, aExpectedUnitsObserved); 222 } 223 224 template <typename Char, size_t N> 225 static void ExpectNotShortestForm(const Char (&aCharN)[N], 226 char32_t aExpectedBadCodePoint, 227 uint8_t aExpectedUnitsObserved) { 228 ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotShortestForm, 0xFF, 229 0xFF, aExpectedBadCodePoint, 230 aExpectedUnitsObserved); 231 } 232 233 template <typename Char, size_t N> 234 static void ExpectBadCodePoint(const Char (&aCharN)[N], 235 char32_t aExpectedBadCodePoint, 236 uint8_t aExpectedUnitsObserved) { 237 ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadCodePoint, 0xFF, 238 0xFF, aExpectedBadCodePoint, 239 aExpectedUnitsObserved); 240 } 241 242 static void TestIsUtf8() { 243 // Note we include the U+0000 NULL in this one -- and that's fine. 244 static const char asciiBytes[] = "How about a nice game of chess?"; 245 MOZ_RELEASE_ASSERT(IsUtf8(Span(asciiBytes, std::size(asciiBytes)))); 246 247 static const char endNonAsciiBytes[] = "Life is like a 🌯"; 248 MOZ_RELEASE_ASSERT( 249 IsUtf8(Span(endNonAsciiBytes, std::size(endNonAsciiBytes) - 1))); 250 251 static const unsigned char badLeading[] = {0x80}; 252 MOZ_RELEASE_ASSERT(!IsUtf8(AsChars(Span(badLeading, std::size(badLeading))))); 253 254 // Byte-counts 255 256 // 1 257 static const char oneBytes[] = "A"; // U+0041 LATIN CAPITAL LETTER A 258 constexpr size_t oneBytesLen = std::size(oneBytes); 259 static_assert(oneBytesLen == 2, "U+0041 plus nul"); 260 MOZ_RELEASE_ASSERT(IsUtf8(Span(oneBytes, oneBytesLen))); 261 262 // 2 263 static const char twoBytes[] = "؆"; // U+0606 ARABIC-INDIC CUBE ROOT 264 constexpr size_t twoBytesLen = std::size(twoBytes); 265 static_assert(twoBytesLen == 3, "U+0606 in two bytes plus nul"); 266 MOZ_RELEASE_ASSERT(IsUtf8(Span(twoBytes, twoBytesLen))); 267 268 ExpectValidCodePoint(twoBytes, 0x0606); 269 270 // 3 271 static const char threeBytes[] = "᨞"; // U+1A1E BUGINESE PALLAWA 272 constexpr size_t threeBytesLen = std::size(threeBytes); 273 static_assert(threeBytesLen == 4, "U+1A1E in three bytes plus nul"); 274 MOZ_RELEASE_ASSERT(IsUtf8(Span(threeBytes, threeBytesLen))); 275 276 ExpectValidCodePoint(threeBytes, 0x1A1E); 277 278 // 4 279 static const char fourBytes[] = "🁡"; // U+1F061 DOMINO TILE HORIZONTAL-06-06 280 constexpr size_t fourBytesLen = std::size(fourBytes); 281 static_assert(fourBytesLen == 5, "U+1F061 in four bytes plus nul"); 282 MOZ_RELEASE_ASSERT(IsUtf8(Span(fourBytes, fourBytesLen))); 283 284 ExpectValidCodePoint(fourBytes, 0x1F061); 285 286 // Max code point 287 static const char maxCodePoint[] = ""; // U+10FFFF 288 constexpr size_t maxCodePointLen = std::size(maxCodePoint); 289 static_assert(maxCodePointLen == 5, "U+10FFFF in four bytes plus nul"); 290 MOZ_RELEASE_ASSERT(IsUtf8(Span(maxCodePoint, maxCodePointLen))); 291 292 ExpectValidCodePoint(maxCodePoint, 0x10FFFF); 293 294 // One past max code point 295 static const unsigned char onePastMaxCodePoint[] = {0xF4, 0x90, 0x80, 0x80, 296 0x0}; 297 constexpr size_t onePastMaxCodePointLen = std::size(onePastMaxCodePoint); 298 MOZ_RELEASE_ASSERT( 299 !IsUtf8(AsChars(Span(onePastMaxCodePoint, onePastMaxCodePointLen)))); 300 301 ExpectBadCodePoint(onePastMaxCodePoint, 0x110000, 4); 302 303 // Surrogate-related testing 304 305 // (Note that the various code unit sequences here are null-terminated to 306 // simplify life for ExpectValidCodePoint, which presumes null termination.) 307 308 static const unsigned char justBeforeSurrogates[] = {0xED, 0x9F, 0xBF, 0x0}; 309 constexpr size_t justBeforeSurrogatesLen = 310 std::size(justBeforeSurrogates) - 1; 311 MOZ_RELEASE_ASSERT( 312 IsUtf8(AsChars(Span(justBeforeSurrogates, justBeforeSurrogatesLen)))); 313 314 ExpectValidCodePoint(justBeforeSurrogates, 0xD7FF); 315 316 static const unsigned char leastSurrogate[] = {0xED, 0xA0, 0x80, 0x0}; 317 constexpr size_t leastSurrogateLen = std::size(leastSurrogate) - 1; 318 MOZ_RELEASE_ASSERT(!IsUtf8(AsChars(Span(leastSurrogate, leastSurrogateLen)))); 319 320 ExpectBadCodePoint(leastSurrogate, 0xD800, 3); 321 322 static const unsigned char arbitraryHighSurrogate[] = {0xED, 0xA2, 0x87, 0x0}; 323 constexpr size_t arbitraryHighSurrogateLen = 324 std::size(arbitraryHighSurrogate) - 1; 325 MOZ_RELEASE_ASSERT(!IsUtf8( 326 AsChars(Span(arbitraryHighSurrogate, arbitraryHighSurrogateLen)))); 327 328 ExpectBadCodePoint(arbitraryHighSurrogate, 0xD887, 3); 329 330 static const unsigned char arbitraryLowSurrogate[] = {0xED, 0xB7, 0xAF, 0x0}; 331 constexpr size_t arbitraryLowSurrogateLen = 332 std::size(arbitraryLowSurrogate) - 1; 333 MOZ_RELEASE_ASSERT( 334 !IsUtf8(AsChars(Span(arbitraryLowSurrogate, arbitraryLowSurrogateLen)))); 335 336 ExpectBadCodePoint(arbitraryLowSurrogate, 0xDDEF, 3); 337 338 static const unsigned char greatestSurrogate[] = {0xED, 0xBF, 0xBF, 0x0}; 339 constexpr size_t greatestSurrogateLen = std::size(greatestSurrogate) - 1; 340 MOZ_RELEASE_ASSERT( 341 !IsUtf8(AsChars(Span(greatestSurrogate, greatestSurrogateLen)))); 342 343 ExpectBadCodePoint(greatestSurrogate, 0xDFFF, 3); 344 345 static const unsigned char justAfterSurrogates[] = {0xEE, 0x80, 0x80, 0x0}; 346 constexpr size_t justAfterSurrogatesLen = std::size(justAfterSurrogates) - 1; 347 MOZ_RELEASE_ASSERT( 348 IsUtf8(AsChars(Span(justAfterSurrogates, justAfterSurrogatesLen)))); 349 350 ExpectValidCodePoint(justAfterSurrogates, 0xE000); 351 } 352 353 static void TestDecodeOneValidUtf8CodePoint() { 354 // NOTE: DecodeOneUtf8CodePoint decodes only *non*-ASCII code points that 355 // consist of multiple code units, so there are no ASCII tests below. 356 357 // Length two. 358 359 ExpectValidCodePoint("", 0x80); // <control> 360 ExpectValidCodePoint("©", 0xA9); // COPYRIGHT SIGN 361 ExpectValidCodePoint("¶", 0xB6); // PILCROW SIGN 362 ExpectValidCodePoint("¾", 0xBE); // VULGAR FRACTION THREE QUARTERS 363 ExpectValidCodePoint("÷", 0xF7); // DIVISION SIGN 364 ExpectValidCodePoint("ÿ", 0xFF); // LATIN SMALL LETTER Y WITH DIAERESIS 365 ExpectValidCodePoint("Ā", 0x100); // LATIN CAPITAL LETTER A WITH MACRON 366 ExpectValidCodePoint("IJ", 0x132); // LATIN CAPITAL LETTER LIGATURE IJ 367 ExpectValidCodePoint("ͼ", 0x37C); // GREEK SMALL DOTTED LUNATE SIGMA SYMBOL 368 ExpectValidCodePoint("Ӝ", 369 0x4DC); // CYRILLIC CAPITAL LETTER ZHE WITTH DIAERESIS 370 ExpectValidCodePoint("۩", 0x6E9); // ARABIC PLACE OF SAJDAH 371 ExpectValidCodePoint("߿", 0x7FF); // <not assigned> 372 373 // Length three. 374 375 ExpectValidCodePoint("ࠀ", 0x800); // SAMARITAN LETTER ALAF 376 ExpectValidCodePoint("ࡁ", 0x841); // MANDAIC LETTER AB 377 ExpectValidCodePoint("ࣿ", 0x8FF); // ARABIC MARK SIDEWAYS NOON GHUNNA 378 ExpectValidCodePoint("ஆ", 0xB86); // TAMIL LETTER AA 379 ExpectValidCodePoint("༃", 380 0xF03); // TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA 381 ExpectValidCodePoint( 382 "࿉", 383 0xFC9); // TIBETAN SYMBOL NOR BU (but on my system it really looks like 384 // SOFT-SERVE ICE CREAM FROM ABOVE THE PLANE if you ask me) 385 ExpectValidCodePoint("ဪ", 0x102A); // MYANMAR LETTER AU 386 ExpectValidCodePoint("ᚏ", 0x168F); // OGHAM LETTER RUIS 387 ExpectValidCodePoint("\xE2\x80\xA8", 0x2028); // (the hated) LINE SEPARATOR 388 ExpectValidCodePoint("\xE2\x80\xA9", 389 0x2029); // (the hated) PARAGRAPH SEPARATOR 390 ExpectValidCodePoint("☬", 0x262C); // ADI SHAKTI 391 ExpectValidCodePoint("㊮", 0x32AE); // CIRCLED IDEOGRAPH RESOURCE 392 ExpectValidCodePoint("㏖", 0x33D6); // SQUARE MOL 393 ExpectValidCodePoint("ꔄ", 0xA504); // VAI SYLLABLE WEEN 394 ExpectValidCodePoint("ퟕ", 0xD7D5); // HANGUL JONGSEONG RIEUL-SSANGKIYEOK 395 ExpectValidCodePoint("", 0xD7FF); // <not assigned> 396 ExpectValidCodePoint("", 0xE000); // <Private Use> 397 ExpectValidCodePoint("鱗", 0xF9F2); // CJK COMPATIBILITY IDEOGRAPH-F9F 398 ExpectValidCodePoint( 399 "﷽", 0xFDFD); // ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHHHEEEEM 400 ExpectValidCodePoint("", 0xFFFF); // <not assigned> 401 402 // Length four. 403 ExpectValidCodePoint("𐀀", 0x10000); // LINEAR B SYLLABLE B008 A 404 ExpectValidCodePoint("𔑀", 0x14440); // ANATOLIAN HIEROGLYPH A058 405 ExpectValidCodePoint("𝛗", 0x1D6D7); // MATHEMATICAL BOLD SMALL PHI 406 ExpectValidCodePoint("💩", 0x1F4A9); // PILE OF POO 407 ExpectValidCodePoint("🔫", 0x1F52B); // PISTOL 408 ExpectValidCodePoint("🥌", 0x1F94C); // CURLING STONE 409 ExpectValidCodePoint("🥏", 0x1F94F); // FLYING DISC 410 ExpectValidCodePoint("𠍆", 0x20346); // CJK UNIFIED IDEOGRAPH-20346 411 ExpectValidCodePoint("𡠺", 0x2183A); // CJK UNIFIED IDEOGRAPH-2183A 412 ExpectValidCodePoint("", 0x417F6); // <not assigned> 413 ExpectValidCodePoint("", 0x7E836); // <not assigned> 414 ExpectValidCodePoint("", 0xFEF67); // <Plane 15 Private Use> 415 ExpectValidCodePoint("", 0x10FFFF); // 416 } 417 418 static void TestDecodeBadLeadUnit() { 419 // These tests are actually exhaustive. 420 421 unsigned char badLead[] = {'\0', '\0'}; 422 423 for (uint8_t lead : IntegerRange(0b1000'0000, 0b1100'0000)) { 424 badLead[0] = lead; 425 ExpectBadLeadUnit(badLead); 426 } 427 428 { 429 uint8_t lead = 0b1111'1000; 430 do { 431 badLead[0] = lead; 432 ExpectBadLeadUnit(badLead); 433 if (lead == 0b1111'1111) { 434 break; 435 } 436 437 lead++; 438 } while (true); 439 } 440 } 441 442 static void TestTooFewOrBadTrailingUnits() { 443 // Lead unit indicates a two-byte code point. 444 445 char truncatedTwo[] = {'\0', '\0'}; 446 char badTrailTwo[] = {'\0', '\0', '\0'}; 447 448 for (uint8_t lead : IntegerRange(0b1100'0000, 0b1110'0000)) { 449 truncatedTwo[0] = lead; 450 ExpectNotEnoughUnits(truncatedTwo, 1, 2); 451 452 badTrailTwo[0] = lead; 453 for (uint8_t trail : IntegerRange(0b0000'0000, 0b1000'0000)) { 454 badTrailTwo[1] = trail; 455 ExpectBadTrailingUnit(badTrailTwo, 2); 456 } 457 458 for (uint8_t trail : IntegerRange(0b1100'0000, 0b1111'1111)) { 459 badTrailTwo[1] = trail; 460 ExpectBadTrailingUnit(badTrailTwo, 2); 461 } 462 } 463 464 // Lead unit indicates a three-byte code point. 465 466 char truncatedThreeOne[] = {'\0', '\0'}; 467 char truncatedThreeTwo[] = {'\0', '\0', '\0'}; 468 unsigned char badTrailThree[] = {'\0', '\0', '\0', '\0'}; 469 470 for (uint8_t lead : IntegerRange(0b1110'0000, 0b1111'0000)) { 471 truncatedThreeOne[0] = lead; 472 ExpectNotEnoughUnits(truncatedThreeOne, 1, 3); 473 474 truncatedThreeTwo[0] = lead; 475 ExpectNotEnoughUnits(truncatedThreeTwo, 2, 3); 476 477 badTrailThree[0] = lead; 478 badTrailThree[2] = 0b1011'1111; // make valid to test overreads 479 for (uint8_t mid : IntegerRange(0b0000'0000, 0b1000'0000)) { 480 badTrailThree[1] = mid; 481 ExpectBadTrailingUnit(badTrailThree, 2); 482 } 483 { 484 uint8_t mid = 0b1100'0000; 485 do { 486 badTrailThree[1] = mid; 487 ExpectBadTrailingUnit(badTrailThree, 2); 488 if (mid == 0b1111'1111) { 489 break; 490 } 491 492 mid++; 493 } while (true); 494 } 495 496 badTrailThree[1] = 0b1011'1111; 497 for (uint8_t last : IntegerRange(0b0000'0000, 0b1000'0000)) { 498 badTrailThree[2] = last; 499 ExpectBadTrailingUnit(badTrailThree, 3); 500 } 501 { 502 uint8_t last = 0b1100'0000; 503 do { 504 badTrailThree[2] = last; 505 ExpectBadTrailingUnit(badTrailThree, 3); 506 if (last == 0b1111'1111) { 507 break; 508 } 509 510 last++; 511 } while (true); 512 } 513 } 514 515 // Lead unit indicates a four-byte code point. 516 517 char truncatedFourOne[] = {'\0', '\0'}; 518 char truncatedFourTwo[] = {'\0', '\0', '\0'}; 519 char truncatedFourThree[] = {'\0', '\0', '\0', '\0'}; 520 521 unsigned char badTrailFour[] = {'\0', '\0', '\0', '\0', '\0'}; 522 523 for (uint8_t lead : IntegerRange(0b1111'0000, 0b1111'1000)) { 524 truncatedFourOne[0] = lead; 525 ExpectNotEnoughUnits(truncatedFourOne, 1, 4); 526 527 truncatedFourTwo[0] = lead; 528 ExpectNotEnoughUnits(truncatedFourTwo, 2, 4); 529 530 truncatedFourThree[0] = lead; 531 ExpectNotEnoughUnits(truncatedFourThree, 3, 4); 532 533 badTrailFour[0] = lead; 534 badTrailFour[2] = badTrailFour[3] = 0b1011'1111; // test for overreads 535 for (uint8_t second : IntegerRange(0b0000'0000, 0b1000'0000)) { 536 badTrailFour[1] = second; 537 ExpectBadTrailingUnit(badTrailFour, 2); 538 } 539 { 540 uint8_t second = 0b1100'0000; 541 do { 542 badTrailFour[1] = second; 543 ExpectBadTrailingUnit(badTrailFour, 2); 544 if (second == 0b1111'1111) { 545 break; 546 } 547 548 second++; 549 } while (true); 550 } 551 552 badTrailFour[1] = badTrailFour[3] = 0b1011'1111; // test for overreads 553 for (uint8_t third : IntegerRange(0b0000'0000, 0b1000'0000)) { 554 badTrailFour[2] = third; 555 ExpectBadTrailingUnit(badTrailFour, 3); 556 } 557 { 558 uint8_t third = 0b1100'0000; 559 do { 560 badTrailFour[2] = third; 561 ExpectBadTrailingUnit(badTrailFour, 3); 562 if (third == 0b1111'1111) { 563 break; 564 } 565 566 third++; 567 } while (true); 568 } 569 570 badTrailFour[2] = 0b1011'1111; 571 for (uint8_t fourth : IntegerRange(0b0000'0000, 0b1000'0000)) { 572 badTrailFour[3] = fourth; 573 ExpectBadTrailingUnit(badTrailFour, 4); 574 } 575 { 576 uint8_t fourth = 0b1100'0000; 577 do { 578 badTrailFour[3] = fourth; 579 ExpectBadTrailingUnit(badTrailFour, 4); 580 if (fourth == 0b1111'1111) { 581 break; 582 } 583 584 fourth++; 585 } while (true); 586 } 587 } 588 } 589 590 static void TestBadSurrogate() { 591 // These tests are actually exhaustive. 592 593 ExpectValidCodePoint("\xED\x9F\xBF", 0xD7FF); // last before surrogates 594 ExpectValidCodePoint("\xEE\x80\x80", 0xE000); // first after surrogates 595 596 // First invalid surrogate encoding is { 0xED, 0xA0, 0x80 }. Last invalid 597 // surrogate encoding is { 0xED, 0xBF, 0xBF }. 598 599 char badSurrogate[] = {'\xED', '\0', '\0', '\0'}; 600 601 for (char32_t c = 0xD800; c < 0xE000; c++) { 602 badSurrogate[1] = 0b1000'0000 ^ ((c & 0b1111'1100'0000) >> 6); 603 badSurrogate[2] = 0b1000'0000 ^ ((c & 0b0000'0011'1111)); 604 605 ExpectBadCodePoint(badSurrogate, c, 3); 606 } 607 } 608 609 static void TestBadTooBig() { 610 // These tests are actually exhaustive. 611 612 ExpectValidCodePoint("\xF4\x8F\xBF\xBF", 0x10'FFFF); // last code point 613 614 // Four-byte code points are 615 // 616 // 0b1111'0xxx 0b10xx'xxxx 0b10xx'xxxx 0b10xx'xxxx 617 // 618 // with 3 + 6 + 6 + 6 == 21 unconstrained bytes, so the structurally 619 // representable limit (exclusive) is 2**21 - 1 == 2097152. 620 621 char tooLargeCodePoint[] = {'\0', '\0', '\0', '\0', '\0'}; 622 623 for (char32_t c = 0x11'0000; c < (1 << 21); c++) { 624 tooLargeCodePoint[0] = 625 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); 626 tooLargeCodePoint[1] = 627 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); 628 tooLargeCodePoint[2] = 629 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); 630 tooLargeCodePoint[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); 631 632 ExpectBadCodePoint(tooLargeCodePoint, c, 4); 633 } 634 } 635 636 static void TestBadCodePoint() { 637 TestBadSurrogate(); 638 TestBadTooBig(); 639 } 640 641 static void TestNotShortestForm() { 642 { 643 // One-byte in two-byte. 644 645 char oneInTwo[] = {'\0', '\0', '\0'}; 646 647 for (char32_t c = '\0'; c < 0x80; c++) { 648 oneInTwo[0] = 0b1100'0000 ^ ((c & 0b0111'1100'0000) >> 6); 649 oneInTwo[1] = 0b1000'0000 ^ ((c & 0b0000'0011'1111)); 650 651 ExpectNotShortestForm(oneInTwo, c, 2); 652 } 653 654 // One-byte in three-byte. 655 656 char oneInThree[] = {'\0', '\0', '\0', '\0'}; 657 658 for (char32_t c = '\0'; c < 0x80; c++) { 659 oneInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12); 660 oneInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6); 661 oneInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111)); 662 663 ExpectNotShortestForm(oneInThree, c, 3); 664 } 665 666 // One-byte in four-byte. 667 668 char oneInFour[] = {'\0', '\0', '\0', '\0', '\0'}; 669 670 for (char32_t c = '\0'; c < 0x80; c++) { 671 oneInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); 672 oneInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); 673 oneInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); 674 oneInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); 675 676 ExpectNotShortestForm(oneInFour, c, 4); 677 } 678 } 679 680 { 681 // Two-byte in three-byte. 682 683 char twoInThree[] = {'\0', '\0', '\0', '\0'}; 684 685 for (char32_t c = 0x80; c < 0x800; c++) { 686 twoInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12); 687 twoInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6); 688 twoInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111)); 689 690 ExpectNotShortestForm(twoInThree, c, 3); 691 } 692 693 // Two-byte in four-byte. 694 695 char twoInFour[] = {'\0', '\0', '\0', '\0', '\0'}; 696 697 for (char32_t c = 0x80; c < 0x800; c++) { 698 twoInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); 699 twoInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); 700 twoInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); 701 twoInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); 702 703 ExpectNotShortestForm(twoInFour, c, 4); 704 } 705 } 706 707 { 708 // Three-byte in four-byte. 709 710 char threeInFour[] = {'\0', '\0', '\0', '\0', '\0'}; 711 712 for (char32_t c = 0x800; c < 0x1'0000; c++) { 713 threeInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); 714 threeInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); 715 threeInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); 716 threeInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); 717 718 ExpectNotShortestForm(threeInFour, c, 4); 719 } 720 } 721 } 722 723 static void TestDecodeOneInvalidUtf8CodePoint() { 724 TestDecodeBadLeadUnit(); 725 TestTooFewOrBadTrailingUnits(); 726 TestBadCodePoint(); 727 TestNotShortestForm(); 728 } 729 730 static void TestDecodeOneUtf8CodePoint() { 731 TestDecodeOneValidUtf8CodePoint(); 732 TestDecodeOneInvalidUtf8CodePoint(); 733 } 734 735 int main() { 736 TestUtf8Unit(); 737 TestIsUtf8(); 738 TestDecodeOneUtf8CodePoint(); 739 return 0; 740 }