[ tor-browser ].git.dasho

TestUtf8.cpp (26076B)
      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #define MOZ_PRETEND_NO_JSRUST 1
      8 
      9 #include "mozilla/Utf8.h"
     10 
     11 #include "mozilla/Assertions.h"
     12 #include "mozilla/EnumSet.h"
     13 #include "mozilla/IntegerRange.h"
     14 #include "mozilla/Span.h"
     15 
     16 using mozilla::AsChars;
     17 using mozilla::DecodeOneUtf8CodePoint;
     18 using mozilla::EnumSet;
     19 using mozilla::IntegerRange;
     20 using mozilla::IsAscii;
     21 using mozilla::IsUtf8;
     22 using mozilla::Span;
     23 using mozilla::Utf8Unit;
     24 
     25 static void TestUtf8Unit() {
     26  Utf8Unit c('A');
     27  MOZ_RELEASE_ASSERT(c.toChar() == 'A');
     28  MOZ_RELEASE_ASSERT(c == Utf8Unit('A'));
     29  MOZ_RELEASE_ASSERT(c != Utf8Unit('B'));
     30  MOZ_RELEASE_ASSERT(c.toUint8() == 0x41);
     31 
     32  unsigned char asUnsigned = 'A';
     33  MOZ_RELEASE_ASSERT(c.toUnsignedChar() == asUnsigned);
     34  MOZ_RELEASE_ASSERT(Utf8Unit('B').toUnsignedChar() != asUnsigned);
     35 
     36  Utf8Unit first('@');
     37  Utf8Unit second('#');
     38 
     39  MOZ_RELEASE_ASSERT(first != second);
     40 
     41  first = second;
     42  MOZ_RELEASE_ASSERT(first == second);
     43 }
     44 
     45 template <typename Char>
     46 struct ToUtf8Units {
     47 public:
     48  explicit ToUtf8Units(const Char* aStart, const Char* aEnd)
     49      : lead(Utf8Unit(aStart[0])), iter(aStart + 1), end(aEnd) {
     50    MOZ_RELEASE_ASSERT(!IsAscii(aStart[0]));
     51  }
     52 
     53  const Utf8Unit lead;
     54  const Char* iter;
     55  const Char* const end;
     56 };
     57 
     58 class AssertIfCalled {
     59 public:
     60  template <typename... Args>
     61  void operator()(Args&&... aArgs) {
     62    MOZ_RELEASE_ASSERT(false, "AssertIfCalled instance was called");
     63  }
     64 };
     65 
     66 // NOTE: For simplicity in treating |aCharN| identically regardless whether it's
     67 //       a string literal or a more-generalized array, we require |aCharN| be
     68 //       null-terminated.
     69 
     70 template <typename Char, size_t N>
     71 static void ExpectValidCodePoint(const Char (&aCharN)[N],
     72                                 char32_t aExpectedCodePoint) {
     73  MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
     74                     "array must be null-terminated for |aCharN + N - 1| to "
     75                     "compute the value of |aIter| as altered by "
     76                     "DecodeOneUtf8CodePoint");
     77 
     78  ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
     79  auto simple =
     80      DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
     81  MOZ_RELEASE_ASSERT(simple.isSome());
     82  MOZ_RELEASE_ASSERT(*simple == aExpectedCodePoint);
     83  MOZ_RELEASE_ASSERT(simpleUnit.iter == simpleUnit.end);
     84 
     85  ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
     86  auto complex = DecodeOneUtf8CodePoint(
     87      complexUnit.lead, &complexUnit.iter, complexUnit.end, AssertIfCalled(),
     88      AssertIfCalled(), AssertIfCalled(), AssertIfCalled(), AssertIfCalled());
     89  MOZ_RELEASE_ASSERT(complex.isSome());
     90  MOZ_RELEASE_ASSERT(*complex == aExpectedCodePoint);
     91  MOZ_RELEASE_ASSERT(complexUnit.iter == complexUnit.end);
     92 }
     93 
     94 enum class InvalidUtf8Reason {
     95  BadLeadUnit,
     96  NotEnoughUnits,
     97  BadTrailingUnit,
     98  BadCodePoint,
     99  NotShortestForm,
    100 };
    101 
    102 template <typename Char, size_t N>
    103 static void ExpectInvalidCodePointHelper(const Char (&aCharN)[N],
    104                                         InvalidUtf8Reason aExpectedReason,
    105                                         uint8_t aExpectedUnitsAvailable,
    106                                         uint8_t aExpectedUnitsNeeded,
    107                                         char32_t aExpectedBadCodePoint,
    108                                         uint8_t aExpectedUnitsObserved) {
    109  MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
    110                     "array must be null-terminated for |aCharN + N - 1| to "
    111                     "compute the value of |aIter| as altered by "
    112                     "DecodeOneUtf8CodePoint");
    113 
    114  ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
    115  auto simple =
    116      DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
    117  MOZ_RELEASE_ASSERT(simple.isNothing());
    118  MOZ_RELEASE_ASSERT(static_cast<const void*>(simpleUnit.iter) == aCharN);
    119 
    120  EnumSet<InvalidUtf8Reason> reasons;
    121  uint8_t unitsAvailable;
    122  uint8_t unitsNeeded;
    123  char32_t badCodePoint;
    124  uint8_t unitsObserved;
    125 
    126  struct OnNotShortestForm {
    127    EnumSet<InvalidUtf8Reason>& reasons;
    128    char32_t& badCodePoint;
    129    uint8_t& unitsObserved;
    130 
    131    void operator()(char32_t aBadCodePoint, uint8_t aUnitsObserved) {
    132      reasons += InvalidUtf8Reason::NotShortestForm;
    133      badCodePoint = aBadCodePoint;
    134      unitsObserved = aUnitsObserved;
    135    }
    136  };
    137 
    138  ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
    139  auto complex = DecodeOneUtf8CodePoint(
    140      complexUnit.lead, &complexUnit.iter, complexUnit.end,
    141      [&reasons]() { reasons += InvalidUtf8Reason::BadLeadUnit; },
    142      [&reasons, &unitsAvailable, &unitsNeeded](uint8_t aUnitsAvailable,
    143                                                uint8_t aUnitsNeeded) {
    144        reasons += InvalidUtf8Reason::NotEnoughUnits;
    145        unitsAvailable = aUnitsAvailable;
    146        unitsNeeded = aUnitsNeeded;
    147      },
    148      [&reasons, &unitsObserved](uint8_t aUnitsObserved) {
    149        reasons += InvalidUtf8Reason::BadTrailingUnit;
    150        unitsObserved = aUnitsObserved;
    151      },
    152      [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
    153                                                uint8_t aUnitsObserved) {
    154        reasons += InvalidUtf8Reason::BadCodePoint;
    155        badCodePoint = aBadCodePoint;
    156        unitsObserved = aUnitsObserved;
    157      },
    158      [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
    159                                                uint8_t aUnitsObserved) {
    160        reasons += InvalidUtf8Reason::NotShortestForm;
    161        badCodePoint = aBadCodePoint;
    162        unitsObserved = aUnitsObserved;
    163      });
    164  MOZ_RELEASE_ASSERT(complex.isNothing());
    165  MOZ_RELEASE_ASSERT(static_cast<const void*>(complexUnit.iter) == aCharN);
    166 
    167  bool alreadyIterated = false;
    168  for (InvalidUtf8Reason reason : reasons) {
    169    MOZ_RELEASE_ASSERT(!alreadyIterated);
    170    alreadyIterated = true;
    171 
    172    switch (reason) {
    173      case InvalidUtf8Reason::BadLeadUnit:
    174        break;
    175 
    176      case InvalidUtf8Reason::NotEnoughUnits:
    177        MOZ_RELEASE_ASSERT(unitsAvailable == aExpectedUnitsAvailable);
    178        MOZ_RELEASE_ASSERT(unitsNeeded == aExpectedUnitsNeeded);
    179        break;
    180 
    181      case InvalidUtf8Reason::BadTrailingUnit:
    182        MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
    183        break;
    184 
    185      case InvalidUtf8Reason::BadCodePoint:
    186        MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
    187        MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
    188        break;
    189 
    190      case InvalidUtf8Reason::NotShortestForm:
    191        MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
    192        MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
    193        break;
    194    }
    195  }
    196 }
    197 
    198 // NOTE: For simplicity in treating |aCharN| identically regardless whether it's
    199 //       a string literal or a more-generalized array, we require |aCharN| be
    200 //       null-terminated in all these functions.
    201 
    202 template <typename Char, size_t N>
    203 static void ExpectBadLeadUnit(const Char (&aCharN)[N]) {
    204  ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadLeadUnit, 0xFF,
    205                               0xFF, 0xFFFFFFFF, 0xFF);
    206 }
    207 
    208 template <typename Char, size_t N>
    209 static void ExpectNotEnoughUnits(const Char (&aCharN)[N],
    210                                 uint8_t aExpectedUnitsAvailable,
    211                                 uint8_t aExpectedUnitsNeeded) {
    212  ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotEnoughUnits,
    213                               aExpectedUnitsAvailable, aExpectedUnitsNeeded,
    214                               0xFFFFFFFF, 0xFF);
    215 }
    216 
    217 template <typename Char, size_t N>
    218 static void ExpectBadTrailingUnit(const Char (&aCharN)[N],
    219                                  uint8_t aExpectedUnitsObserved) {
    220  ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadTrailingUnit, 0xFF,
    221                               0xFF, 0xFFFFFFFF, aExpectedUnitsObserved);
    222 }
    223 
    224 template <typename Char, size_t N>
    225 static void ExpectNotShortestForm(const Char (&aCharN)[N],
    226                                  char32_t aExpectedBadCodePoint,
    227                                  uint8_t aExpectedUnitsObserved) {
    228  ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotShortestForm, 0xFF,
    229                               0xFF, aExpectedBadCodePoint,
    230                               aExpectedUnitsObserved);
    231 }
    232 
    233 template <typename Char, size_t N>
    234 static void ExpectBadCodePoint(const Char (&aCharN)[N],
    235                               char32_t aExpectedBadCodePoint,
    236                               uint8_t aExpectedUnitsObserved) {
    237  ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadCodePoint, 0xFF,
    238                               0xFF, aExpectedBadCodePoint,
    239                               aExpectedUnitsObserved);
    240 }
    241 
    242 static void TestIsUtf8() {
    243  // Note we include the U+0000 NULL in this one -- and that's fine.
    244  static const char asciiBytes[] = "How about a nice game of chess?";
    245  MOZ_RELEASE_ASSERT(IsUtf8(Span(asciiBytes, std::size(asciiBytes))));
    246 
    247  static const char endNonAsciiBytes[] = "Life is like a 🌯";
    248  MOZ_RELEASE_ASSERT(
    249      IsUtf8(Span(endNonAsciiBytes, std::size(endNonAsciiBytes) - 1)));
    250 
    251  static const unsigned char badLeading[] = {0x80};
    252  MOZ_RELEASE_ASSERT(!IsUtf8(AsChars(Span(badLeading, std::size(badLeading)))));
    253 
    254  // Byte-counts
    255 
    256  // 1
    257  static const char oneBytes[] = "A";  // U+0041 LATIN CAPITAL LETTER A
    258  constexpr size_t oneBytesLen = std::size(oneBytes);
    259  static_assert(oneBytesLen == 2, "U+0041 plus nul");
    260  MOZ_RELEASE_ASSERT(IsUtf8(Span(oneBytes, oneBytesLen)));
    261 
    262  // 2
    263  static const char twoBytes[] = "؆";  // U+0606 ARABIC-INDIC CUBE ROOT
    264  constexpr size_t twoBytesLen = std::size(twoBytes);
    265  static_assert(twoBytesLen == 3, "U+0606 in two bytes plus nul");
    266  MOZ_RELEASE_ASSERT(IsUtf8(Span(twoBytes, twoBytesLen)));
    267 
    268  ExpectValidCodePoint(twoBytes, 0x0606);
    269 
    270  // 3
    271  static const char threeBytes[] = "᨞";  // U+1A1E BUGINESE PALLAWA
    272  constexpr size_t threeBytesLen = std::size(threeBytes);
    273  static_assert(threeBytesLen == 4, "U+1A1E in three bytes plus nul");
    274  MOZ_RELEASE_ASSERT(IsUtf8(Span(threeBytes, threeBytesLen)));
    275 
    276  ExpectValidCodePoint(threeBytes, 0x1A1E);
    277 
    278  // 4
    279  static const char fourBytes[] = "🁡";  // U+1F061 DOMINO TILE HORIZONTAL-06-06
    280  constexpr size_t fourBytesLen = std::size(fourBytes);
    281  static_assert(fourBytesLen == 5, "U+1F061 in four bytes plus nul");
    282  MOZ_RELEASE_ASSERT(IsUtf8(Span(fourBytes, fourBytesLen)));
    283 
    284  ExpectValidCodePoint(fourBytes, 0x1F061);
    285 
    286  // Max code point
    287  static const char maxCodePoint[] = "􏿿";  // U+10FFFF
    288  constexpr size_t maxCodePointLen = std::size(maxCodePoint);
    289  static_assert(maxCodePointLen == 5, "U+10FFFF in four bytes plus nul");
    290  MOZ_RELEASE_ASSERT(IsUtf8(Span(maxCodePoint, maxCodePointLen)));
    291 
    292  ExpectValidCodePoint(maxCodePoint, 0x10FFFF);
    293 
    294  // One past max code point
    295  static const unsigned char onePastMaxCodePoint[] = {0xF4, 0x90, 0x80, 0x80,
    296                                                      0x0};
    297  constexpr size_t onePastMaxCodePointLen = std::size(onePastMaxCodePoint);
    298  MOZ_RELEASE_ASSERT(
    299      !IsUtf8(AsChars(Span(onePastMaxCodePoint, onePastMaxCodePointLen))));
    300 
    301  ExpectBadCodePoint(onePastMaxCodePoint, 0x110000, 4);
    302 
    303  // Surrogate-related testing
    304 
    305  // (Note that the various code unit sequences here are null-terminated to
    306  // simplify life for ExpectValidCodePoint, which presumes null termination.)
    307 
    308  static const unsigned char justBeforeSurrogates[] = {0xED, 0x9F, 0xBF, 0x0};
    309  constexpr size_t justBeforeSurrogatesLen =
    310      std::size(justBeforeSurrogates) - 1;
    311  MOZ_RELEASE_ASSERT(
    312      IsUtf8(AsChars(Span(justBeforeSurrogates, justBeforeSurrogatesLen))));
    313 
    314  ExpectValidCodePoint(justBeforeSurrogates, 0xD7FF);
    315 
    316  static const unsigned char leastSurrogate[] = {0xED, 0xA0, 0x80, 0x0};
    317  constexpr size_t leastSurrogateLen = std::size(leastSurrogate) - 1;
    318  MOZ_RELEASE_ASSERT(!IsUtf8(AsChars(Span(leastSurrogate, leastSurrogateLen))));
    319 
    320  ExpectBadCodePoint(leastSurrogate, 0xD800, 3);
    321 
    322  static const unsigned char arbitraryHighSurrogate[] = {0xED, 0xA2, 0x87, 0x0};
    323  constexpr size_t arbitraryHighSurrogateLen =
    324      std::size(arbitraryHighSurrogate) - 1;
    325  MOZ_RELEASE_ASSERT(!IsUtf8(
    326      AsChars(Span(arbitraryHighSurrogate, arbitraryHighSurrogateLen))));
    327 
    328  ExpectBadCodePoint(arbitraryHighSurrogate, 0xD887, 3);
    329 
    330  static const unsigned char arbitraryLowSurrogate[] = {0xED, 0xB7, 0xAF, 0x0};
    331  constexpr size_t arbitraryLowSurrogateLen =
    332      std::size(arbitraryLowSurrogate) - 1;
    333  MOZ_RELEASE_ASSERT(
    334      !IsUtf8(AsChars(Span(arbitraryLowSurrogate, arbitraryLowSurrogateLen))));
    335 
    336  ExpectBadCodePoint(arbitraryLowSurrogate, 0xDDEF, 3);
    337 
    338  static const unsigned char greatestSurrogate[] = {0xED, 0xBF, 0xBF, 0x0};
    339  constexpr size_t greatestSurrogateLen = std::size(greatestSurrogate) - 1;
    340  MOZ_RELEASE_ASSERT(
    341      !IsUtf8(AsChars(Span(greatestSurrogate, greatestSurrogateLen))));
    342 
    343  ExpectBadCodePoint(greatestSurrogate, 0xDFFF, 3);
    344 
    345  static const unsigned char justAfterSurrogates[] = {0xEE, 0x80, 0x80, 0x0};
    346  constexpr size_t justAfterSurrogatesLen = std::size(justAfterSurrogates) - 1;
    347  MOZ_RELEASE_ASSERT(
    348      IsUtf8(AsChars(Span(justAfterSurrogates, justAfterSurrogatesLen))));
    349 
    350  ExpectValidCodePoint(justAfterSurrogates, 0xE000);
    351 }
    352 
    353 static void TestDecodeOneValidUtf8CodePoint() {
    354  // NOTE: DecodeOneUtf8CodePoint decodes only *non*-ASCII code points that
    355  //       consist of multiple code units, so there are no ASCII tests below.
    356 
    357  // Length two.
    358 
    359  ExpectValidCodePoint("", 0x80);  // <control>
    360  ExpectValidCodePoint("©", 0xA9);   // COPYRIGHT SIGN
    361  ExpectValidCodePoint("¶", 0xB6);   // PILCROW SIGN
    362  ExpectValidCodePoint("¾", 0xBE);   // VULGAR FRACTION THREE QUARTERS
    363  ExpectValidCodePoint("÷", 0xF7);   // DIVISION SIGN
    364  ExpectValidCodePoint("ÿ", 0xFF);   // LATIN SMALL LETTER Y WITH DIAERESIS
    365  ExpectValidCodePoint("Ā", 0x100);  // LATIN CAPITAL LETTER A WITH MACRON
    366  ExpectValidCodePoint("Ĳ", 0x132);  // LATIN CAPITAL LETTER LIGATURE IJ
    367  ExpectValidCodePoint("ͼ", 0x37C);  // GREEK SMALL DOTTED LUNATE SIGMA SYMBOL
    368  ExpectValidCodePoint("Ӝ",
    369                       0x4DC);  // CYRILLIC CAPITAL LETTER ZHE WITTH DIAERESIS
    370  ExpectValidCodePoint("۩", 0x6E9);  // ARABIC PLACE OF SAJDAH
    371  ExpectValidCodePoint("߿", 0x7FF);  // <not assigned>
    372 
    373  // Length three.
    374 
    375  ExpectValidCodePoint("ࠀ", 0x800);  // SAMARITAN LETTER ALAF
    376  ExpectValidCodePoint("ࡁ", 0x841);  // MANDAIC LETTER AB
    377  ExpectValidCodePoint("ࣿ", 0x8FF);   // ARABIC MARK SIDEWAYS NOON GHUNNA
    378  ExpectValidCodePoint("ஆ", 0xB86);  // TAMIL LETTER AA
    379  ExpectValidCodePoint("༃",
    380                       0xF03);  // TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA
    381  ExpectValidCodePoint(
    382      "࿉",
    383      0xFC9);  // TIBETAN SYMBOL NOR BU (but on my system it really looks like
    384               // SOFT-SERVE ICE CREAM FROM ABOVE THE PLANE if you ask me)
    385  ExpectValidCodePoint("ဪ", 0x102A);             // MYANMAR LETTER AU
    386  ExpectValidCodePoint("ᚏ", 0x168F);             // OGHAM LETTER RUIS
    387  ExpectValidCodePoint("\xE2\x80\xA8", 0x2028);  // (the hated) LINE SEPARATOR
    388  ExpectValidCodePoint("\xE2\x80\xA9",
    389                       0x2029);         // (the hated) PARAGRAPH SEPARATOR
    390  ExpectValidCodePoint("☬", 0x262C);    // ADI SHAKTI
    391  ExpectValidCodePoint("㊮", 0x32AE);   // CIRCLED IDEOGRAPH RESOURCE
    392  ExpectValidCodePoint("㏖", 0x33D6);   // SQUARE MOL
    393  ExpectValidCodePoint("ꔄ", 0xA504);    // VAI SYLLABLE WEEN
    394  ExpectValidCodePoint("ퟕ", 0xD7D5);    // HANGUL JONGSEONG RIEUL-SSANGKIYEOK
    395  ExpectValidCodePoint("퟿", 0xD7FF);  // <not assigned>
    396  ExpectValidCodePoint("", 0xE000);  // <Private Use>
    397  ExpectValidCodePoint("鱗", 0xF9F2);   // CJK COMPATIBILITY IDEOGRAPH-F9F
    398  ExpectValidCodePoint(
    399      "﷽", 0xFDFD);  // ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHHHEEEEM
    400  ExpectValidCodePoint("", 0xFFFF);  // <not assigned>
    401 
    402  // Length four.
    403  ExpectValidCodePoint("𐀀", 0x10000);      // LINEAR B SYLLABLE B008 A
    404  ExpectValidCodePoint("𔑀", 0x14440);      // ANATOLIAN HIEROGLYPH A058
    405  ExpectValidCodePoint("𝛗", 0x1D6D7);      // MATHEMATICAL BOLD SMALL PHI
    406  ExpectValidCodePoint("💩", 0x1F4A9);     // PILE OF POO
    407  ExpectValidCodePoint("🔫", 0x1F52B);     // PISTOL
    408  ExpectValidCodePoint("🥌", 0x1F94C);     // CURLING STONE
    409  ExpectValidCodePoint("🥏", 0x1F94F);     // FLYING DISC
    410  ExpectValidCodePoint("𠍆", 0x20346);     // CJK UNIFIED IDEOGRAPH-20346
    411  ExpectValidCodePoint("𡠺", 0x2183A);     // CJK UNIFIED IDEOGRAPH-2183A
    412  ExpectValidCodePoint("񁟶", 0x417F6);   // <not assigned>
    413  ExpectValidCodePoint("񾠶", 0x7E836);   // <not assigned>
    414  ExpectValidCodePoint("󾽧", 0xFEF67);   // <Plane 15 Private Use>
    415  ExpectValidCodePoint("􏿿", 0x10FFFF);  //
    416 }
    417 
    418 static void TestDecodeBadLeadUnit() {
    419  // These tests are actually exhaustive.
    420 
    421  unsigned char badLead[] = {'\0', '\0'};
    422 
    423  for (uint8_t lead : IntegerRange(0b1000'0000, 0b1100'0000)) {
    424    badLead[0] = lead;
    425    ExpectBadLeadUnit(badLead);
    426  }
    427 
    428  {
    429    uint8_t lead = 0b1111'1000;
    430    do {
    431      badLead[0] = lead;
    432      ExpectBadLeadUnit(badLead);
    433      if (lead == 0b1111'1111) {
    434        break;
    435      }
    436 
    437      lead++;
    438    } while (true);
    439  }
    440 }
    441 
    442 static void TestTooFewOrBadTrailingUnits() {
    443  // Lead unit indicates a two-byte code point.
    444 
    445  char truncatedTwo[] = {'\0', '\0'};
    446  char badTrailTwo[] = {'\0', '\0', '\0'};
    447 
    448  for (uint8_t lead : IntegerRange(0b1100'0000, 0b1110'0000)) {
    449    truncatedTwo[0] = lead;
    450    ExpectNotEnoughUnits(truncatedTwo, 1, 2);
    451 
    452    badTrailTwo[0] = lead;
    453    for (uint8_t trail : IntegerRange(0b0000'0000, 0b1000'0000)) {
    454      badTrailTwo[1] = trail;
    455      ExpectBadTrailingUnit(badTrailTwo, 2);
    456    }
    457 
    458    for (uint8_t trail : IntegerRange(0b1100'0000, 0b1111'1111)) {
    459      badTrailTwo[1] = trail;
    460      ExpectBadTrailingUnit(badTrailTwo, 2);
    461    }
    462  }
    463 
    464  // Lead unit indicates a three-byte code point.
    465 
    466  char truncatedThreeOne[] = {'\0', '\0'};
    467  char truncatedThreeTwo[] = {'\0', '\0', '\0'};
    468  unsigned char badTrailThree[] = {'\0', '\0', '\0', '\0'};
    469 
    470  for (uint8_t lead : IntegerRange(0b1110'0000, 0b1111'0000)) {
    471    truncatedThreeOne[0] = lead;
    472    ExpectNotEnoughUnits(truncatedThreeOne, 1, 3);
    473 
    474    truncatedThreeTwo[0] = lead;
    475    ExpectNotEnoughUnits(truncatedThreeTwo, 2, 3);
    476 
    477    badTrailThree[0] = lead;
    478    badTrailThree[2] = 0b1011'1111;  // make valid to test overreads
    479    for (uint8_t mid : IntegerRange(0b0000'0000, 0b1000'0000)) {
    480      badTrailThree[1] = mid;
    481      ExpectBadTrailingUnit(badTrailThree, 2);
    482    }
    483    {
    484      uint8_t mid = 0b1100'0000;
    485      do {
    486        badTrailThree[1] = mid;
    487        ExpectBadTrailingUnit(badTrailThree, 2);
    488        if (mid == 0b1111'1111) {
    489          break;
    490        }
    491 
    492        mid++;
    493      } while (true);
    494    }
    495 
    496    badTrailThree[1] = 0b1011'1111;
    497    for (uint8_t last : IntegerRange(0b0000'0000, 0b1000'0000)) {
    498      badTrailThree[2] = last;
    499      ExpectBadTrailingUnit(badTrailThree, 3);
    500    }
    501    {
    502      uint8_t last = 0b1100'0000;
    503      do {
    504        badTrailThree[2] = last;
    505        ExpectBadTrailingUnit(badTrailThree, 3);
    506        if (last == 0b1111'1111) {
    507          break;
    508        }
    509 
    510        last++;
    511      } while (true);
    512    }
    513  }
    514 
    515  // Lead unit indicates a four-byte code point.
    516 
    517  char truncatedFourOne[] = {'\0', '\0'};
    518  char truncatedFourTwo[] = {'\0', '\0', '\0'};
    519  char truncatedFourThree[] = {'\0', '\0', '\0', '\0'};
    520 
    521  unsigned char badTrailFour[] = {'\0', '\0', '\0', '\0', '\0'};
    522 
    523  for (uint8_t lead : IntegerRange(0b1111'0000, 0b1111'1000)) {
    524    truncatedFourOne[0] = lead;
    525    ExpectNotEnoughUnits(truncatedFourOne, 1, 4);
    526 
    527    truncatedFourTwo[0] = lead;
    528    ExpectNotEnoughUnits(truncatedFourTwo, 2, 4);
    529 
    530    truncatedFourThree[0] = lead;
    531    ExpectNotEnoughUnits(truncatedFourThree, 3, 4);
    532 
    533    badTrailFour[0] = lead;
    534    badTrailFour[2] = badTrailFour[3] = 0b1011'1111;  // test for overreads
    535    for (uint8_t second : IntegerRange(0b0000'0000, 0b1000'0000)) {
    536      badTrailFour[1] = second;
    537      ExpectBadTrailingUnit(badTrailFour, 2);
    538    }
    539    {
    540      uint8_t second = 0b1100'0000;
    541      do {
    542        badTrailFour[1] = second;
    543        ExpectBadTrailingUnit(badTrailFour, 2);
    544        if (second == 0b1111'1111) {
    545          break;
    546        }
    547 
    548        second++;
    549      } while (true);
    550    }
    551 
    552    badTrailFour[1] = badTrailFour[3] = 0b1011'1111;  // test for overreads
    553    for (uint8_t third : IntegerRange(0b0000'0000, 0b1000'0000)) {
    554      badTrailFour[2] = third;
    555      ExpectBadTrailingUnit(badTrailFour, 3);
    556    }
    557    {
    558      uint8_t third = 0b1100'0000;
    559      do {
    560        badTrailFour[2] = third;
    561        ExpectBadTrailingUnit(badTrailFour, 3);
    562        if (third == 0b1111'1111) {
    563          break;
    564        }
    565 
    566        third++;
    567      } while (true);
    568    }
    569 
    570    badTrailFour[2] = 0b1011'1111;
    571    for (uint8_t fourth : IntegerRange(0b0000'0000, 0b1000'0000)) {
    572      badTrailFour[3] = fourth;
    573      ExpectBadTrailingUnit(badTrailFour, 4);
    574    }
    575    {
    576      uint8_t fourth = 0b1100'0000;
    577      do {
    578        badTrailFour[3] = fourth;
    579        ExpectBadTrailingUnit(badTrailFour, 4);
    580        if (fourth == 0b1111'1111) {
    581          break;
    582        }
    583 
    584        fourth++;
    585      } while (true);
    586    }
    587  }
    588 }
    589 
    590 static void TestBadSurrogate() {
    591  // These tests are actually exhaustive.
    592 
    593  ExpectValidCodePoint("\xED\x9F\xBF", 0xD7FF);  // last before surrogates
    594  ExpectValidCodePoint("\xEE\x80\x80", 0xE000);  // first after surrogates
    595 
    596  // First invalid surrogate encoding is { 0xED, 0xA0, 0x80 }.  Last invalid
    597  // surrogate encoding is { 0xED, 0xBF, 0xBF }.
    598 
    599  char badSurrogate[] = {'\xED', '\0', '\0', '\0'};
    600 
    601  for (char32_t c = 0xD800; c < 0xE000; c++) {
    602    badSurrogate[1] = 0b1000'0000 ^ ((c & 0b1111'1100'0000) >> 6);
    603    badSurrogate[2] = 0b1000'0000 ^ ((c & 0b0000'0011'1111));
    604 
    605    ExpectBadCodePoint(badSurrogate, c, 3);
    606  }
    607 }
    608 
    609 static void TestBadTooBig() {
    610  // These tests are actually exhaustive.
    611 
    612  ExpectValidCodePoint("\xF4\x8F\xBF\xBF", 0x10'FFFF);  // last code point
    613 
    614  // Four-byte code points are
    615  //
    616  //   0b1111'0xxx 0b10xx'xxxx 0b10xx'xxxx 0b10xx'xxxx
    617  //
    618  // with 3 + 6 + 6 + 6 == 21 unconstrained bytes, so the structurally
    619  // representable limit (exclusive) is 2**21 - 1 == 2097152.
    620 
    621  char tooLargeCodePoint[] = {'\0', '\0', '\0', '\0', '\0'};
    622 
    623  for (char32_t c = 0x11'0000; c < (1 << 21); c++) {
    624    tooLargeCodePoint[0] =
    625        0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
    626    tooLargeCodePoint[1] =
    627        0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
    628    tooLargeCodePoint[2] =
    629        0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
    630    tooLargeCodePoint[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
    631 
    632    ExpectBadCodePoint(tooLargeCodePoint, c, 4);
    633  }
    634 }
    635 
    636 static void TestBadCodePoint() {
    637  TestBadSurrogate();
    638  TestBadTooBig();
    639 }
    640 
    641 static void TestNotShortestForm() {
    642  {
    643    // One-byte in two-byte.
    644 
    645    char oneInTwo[] = {'\0', '\0', '\0'};
    646 
    647    for (char32_t c = '\0'; c < 0x80; c++) {
    648      oneInTwo[0] = 0b1100'0000 ^ ((c & 0b0111'1100'0000) >> 6);
    649      oneInTwo[1] = 0b1000'0000 ^ ((c & 0b0000'0011'1111));
    650 
    651      ExpectNotShortestForm(oneInTwo, c, 2);
    652    }
    653 
    654    // One-byte in three-byte.
    655 
    656    char oneInThree[] = {'\0', '\0', '\0', '\0'};
    657 
    658    for (char32_t c = '\0'; c < 0x80; c++) {
    659      oneInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12);
    660      oneInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6);
    661      oneInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111));
    662 
    663      ExpectNotShortestForm(oneInThree, c, 3);
    664    }
    665 
    666    // One-byte in four-byte.
    667 
    668    char oneInFour[] = {'\0', '\0', '\0', '\0', '\0'};
    669 
    670    for (char32_t c = '\0'; c < 0x80; c++) {
    671      oneInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
    672      oneInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
    673      oneInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
    674      oneInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
    675 
    676      ExpectNotShortestForm(oneInFour, c, 4);
    677    }
    678  }
    679 
    680  {
    681    // Two-byte in three-byte.
    682 
    683    char twoInThree[] = {'\0', '\0', '\0', '\0'};
    684 
    685    for (char32_t c = 0x80; c < 0x800; c++) {
    686      twoInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12);
    687      twoInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6);
    688      twoInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111));
    689 
    690      ExpectNotShortestForm(twoInThree, c, 3);
    691    }
    692 
    693    // Two-byte in four-byte.
    694 
    695    char twoInFour[] = {'\0', '\0', '\0', '\0', '\0'};
    696 
    697    for (char32_t c = 0x80; c < 0x800; c++) {
    698      twoInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
    699      twoInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
    700      twoInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
    701      twoInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
    702 
    703      ExpectNotShortestForm(twoInFour, c, 4);
    704    }
    705  }
    706 
    707  {
    708    // Three-byte in four-byte.
    709 
    710    char threeInFour[] = {'\0', '\0', '\0', '\0', '\0'};
    711 
    712    for (char32_t c = 0x800; c < 0x1'0000; c++) {
    713      threeInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
    714      threeInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
    715      threeInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
    716      threeInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));
    717 
    718      ExpectNotShortestForm(threeInFour, c, 4);
    719    }
    720  }
    721 }
    722 
    723 static void TestDecodeOneInvalidUtf8CodePoint() {
    724  TestDecodeBadLeadUnit();
    725  TestTooFewOrBadTrailingUnits();
    726  TestBadCodePoint();
    727  TestNotShortestForm();
    728 }
    729 
    730 static void TestDecodeOneUtf8CodePoint() {
    731  TestDecodeOneValidUtf8CodePoint();
    732  TestDecodeOneInvalidUtf8CodePoint();
    733 }
    734 
    735 int main() {
    736  TestUtf8Unit();
    737  TestIsUtf8();
    738  TestDecodeOneUtf8CodePoint();
    739  return 0;
    740 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE