testAtomizeUtf8NonAsciiLatin1CodePoint.cpp (6475B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 #include "mozilla/Maybe.h" // mozilla::Maybe 6 #include "mozilla/Utf8.h" // mozilla::IsTrailingUnit, mozilla::Utf8Unit, mozilla::DecodeOneUtf8CodePoint 7 8 #include <inttypes.h> // UINT8_MAX 9 #include <stdint.h> // uint16_t 10 11 #include "js/Exception.h" // JS_IsExceptionPending, JS_ClearPendingException 12 #include "js/RootingAPI.h" // JS::Rooted, JS::MutableHandle 13 #include "jsapi-tests/tests.h" // BEGIN_TEST, END_TEST, CHECK 14 #include "vm/JSAtomUtils.h" // js::AtomizeChars, js::AtomizeUTF8Chars 15 #include "vm/StringType.h" // JSAtom 16 17 using mozilla::DecodeOneUtf8CodePoint; 18 using mozilla::IsAscii; 19 using mozilla::IsTrailingUnit; 20 using mozilla::Maybe; 21 using mozilla::Utf8Unit; 22 23 using JS::Latin1Char; 24 using JS::MutableHandle; 25 using JS::Rooted; 26 27 BEGIN_TEST(testAtomizeTwoByteUTF8) { 28 Rooted<JSAtom*> atom16(cx); 29 Rooted<JSAtom*> atom8(cx); 30 31 for (uint16_t i = 0; i <= UINT8_MAX; i++) { 32 // Test cases where the first unit is ASCII. 33 if (IsAscii(char16_t(i))) { 34 for (uint16_t j = 0; j <= UINT8_MAX; j++) { 35 if (IsAscii(char16_t(j))) { 36 // If both units are ASCII, the sequence encodes a two-code point 37 // string. 38 if (!shouldBeTwoCodePoints(i, j, &atom16, &atom8)) { 39 return false; 40 } 41 } else { 42 // ASCII followed by non-ASCII should be invalid. 43 if (!shouldBeInvalid(i, j)) { 44 return false; 45 } 46 } 47 } 48 49 continue; 50 } 51 52 // Test remaining cases where the first unit isn't a two-byte lead. 53 if ((i & 0b1110'0000) != 0b1100'0000) { 54 for (uint16_t j = 0; j <= UINT8_MAX; j++) { 55 // If the first unit isn't a two-byte lead, the sequence is invalid no 56 // matter what the second unit is. 57 if (!shouldBeInvalid(i, j)) { 58 return false; 59 } 60 } 61 62 continue; 63 } 64 65 // Test remaining cases where the first unit is the two-byte lead of a 66 // non-Latin-1 code point. 67 if (i >= 0b1100'0100) { 68 for (uint16_t j = 0; j <= UINT8_MAX; j++) { 69 if (IsTrailingUnit(Utf8Unit(static_cast<uint8_t>(j)))) { 70 if (!shouldBeSingleNonLatin1(i, j, &atom16, &atom8)) { 71 return false; 72 } 73 } else { 74 if (!shouldBeInvalid(i, j)) { 75 return false; 76 } 77 } 78 } 79 80 continue; 81 } 82 83 // Test remaining cases where the first unit is the two-byte lead of an 84 // overlong ASCII code point. 85 if (i < 0b1100'0010) { 86 for (uint16_t j = 0; j <= UINT8_MAX; j++) { 87 if (!shouldBeInvalid(i, j)) { 88 return false; 89 } 90 } 91 92 continue; 93 } 94 95 // Finally, test remaining cases where the first unit is the two-byte lead 96 // of a Latin-1 code point. 97 for (uint16_t j = 0; j <= UINT8_MAX; j++) { 98 if (IsTrailingUnit(Utf8Unit(static_cast<uint8_t>(j)))) { 99 if (!shouldBeSingleLatin1(i, j, &atom16, &atom8)) { 100 return false; 101 } 102 } else { 103 if (!shouldBeInvalid(i, j)) { 104 return false; 105 } 106 } 107 } 108 } 109 110 return true; 111 } 112 113 bool shouldBeTwoCodePoints(uint16_t first, uint16_t second, 114 MutableHandle<JSAtom*> atom16, 115 MutableHandle<JSAtom*> atom8) { 116 CHECK(first <= UINT8_MAX); 117 CHECK(second <= UINT8_MAX); 118 CHECK(IsAscii(char16_t(first))); 119 CHECK(IsAscii(char16_t(second))); 120 121 const char16_t utf16[] = {static_cast<char16_t>(first), 122 static_cast<char16_t>(second)}; 123 atom16.set(js::AtomizeChars(cx, utf16, 2)); 124 CHECK(atom16); 125 CHECK(atom16->length() == 2); 126 CHECK(atom16->latin1OrTwoByteChar(0) == first); 127 CHECK(atom16->latin1OrTwoByteChar(1) == second); 128 129 const char utf8[] = {static_cast<char>(first), static_cast<char>(second)}; 130 atom8.set(js::AtomizeUTF8Chars(cx, utf8, 2)); 131 CHECK(atom8); 132 CHECK(atom8->length() == 2); 133 CHECK(atom8->latin1OrTwoByteChar(0) == first); 134 CHECK(atom8->latin1OrTwoByteChar(1) == second); 135 136 CHECK(atom16 == atom8); 137 138 return true; 139 } 140 141 bool shouldBeOneCodePoint(uint16_t first, uint16_t second, char32_t v, 142 MutableHandle<JSAtom*> atom16, 143 MutableHandle<JSAtom*> atom8) { 144 CHECK(first <= UINT8_MAX); 145 CHECK(second <= UINT8_MAX); 146 CHECK(v <= UINT16_MAX); 147 148 const char16_t utf16[] = {static_cast<char16_t>(v)}; 149 atom16.set(js::AtomizeChars(cx, utf16, 1)); 150 CHECK(atom16); 151 CHECK(atom16->length() == 1); 152 CHECK(atom16->latin1OrTwoByteChar(0) == v); 153 154 const char utf8[] = {static_cast<char>(first), static_cast<char>(second)}; 155 atom8.set(js::AtomizeUTF8Chars(cx, utf8, 2)); 156 CHECK(atom8); 157 CHECK(atom8->length() == 1); 158 CHECK(atom8->latin1OrTwoByteChar(0) == v); 159 160 CHECK(atom16 == atom8); 161 162 return true; 163 } 164 165 bool shouldBeSingleNonLatin1(uint16_t first, uint16_t second, 166 MutableHandle<JSAtom*> atom16, 167 MutableHandle<JSAtom*> atom8) { 168 CHECK(first <= UINT8_MAX); 169 CHECK(second <= UINT8_MAX); 170 171 const char bytes[] = {static_cast<char>(first), static_cast<char>(second)}; 172 const char* iter = &bytes[1]; 173 Maybe<char32_t> cp = 174 DecodeOneUtf8CodePoint(Utf8Unit(bytes[0]), &iter, bytes + 2); 175 CHECK(cp.isSome()); 176 177 char32_t v = cp.value(); 178 CHECK(v > UINT8_MAX); 179 180 return shouldBeOneCodePoint(first, second, v, atom16, atom8); 181 } 182 183 bool shouldBeSingleLatin1(uint16_t first, uint16_t second, 184 MutableHandle<JSAtom*> atom16, 185 MutableHandle<JSAtom*> atom8) { 186 CHECK(first <= UINT8_MAX); 187 CHECK(second <= UINT8_MAX); 188 189 const char bytes[] = {static_cast<char>(first), static_cast<char>(second)}; 190 const char* iter = &bytes[1]; 191 Maybe<char32_t> cp = 192 DecodeOneUtf8CodePoint(Utf8Unit(bytes[0]), &iter, bytes + 2); 193 CHECK(cp.isSome()); 194 195 char32_t v = cp.value(); 196 CHECK(v <= UINT8_MAX); 197 198 return shouldBeOneCodePoint(first, second, v, atom16, atom8); 199 } 200 201 bool shouldBeInvalid(uint16_t first, uint16_t second) { 202 CHECK(first <= UINT8_MAX); 203 CHECK(second <= UINT8_MAX); 204 205 const char invalid[] = {static_cast<char>(first), static_cast<char>(second)}; 206 CHECK(!js::AtomizeUTF8Chars(cx, invalid, 2)); 207 CHECK(JS_IsExceptionPending(cx)); 208 JS_ClearPendingException(cx); 209 210 return true; 211 } 212 END_TEST(testAtomizeTwoByteUTF8)