Text.h (12490B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef util_Text_h 8 #define util_Text_h 9 10 #include "mozilla/ArrayUtils.h" 11 #include "mozilla/Assertions.h" 12 #include "mozilla/Attributes.h" 13 #include "mozilla/Casting.h" 14 #include "mozilla/Latin1.h" 15 #include "mozilla/Likely.h" 16 #include "mozilla/TextUtils.h" 17 #include "mozilla/Utf8.h" 18 19 #include <algorithm> 20 #include <stddef.h> 21 #include <stdint.h> 22 #include <string> 23 #include <type_traits> 24 #include <utility> 25 26 #include "NamespaceImports.h" 27 28 #include "js/Utility.h" 29 #include "util/Unicode.h" 30 31 namespace js { 32 class FrontendContext; 33 } // namespace js 34 35 class JSLinearString; 36 37 template <typename CharT> 38 static constexpr MOZ_ALWAYS_INLINE size_t js_strlen(const CharT* s) { 39 if constexpr (std::is_same_v<CharT, JS::Latin1Char>) { 40 return std::char_traits<char>::length(reinterpret_cast<const char*>(s)); 41 } else { 42 return std::char_traits<CharT>::length(s); 43 } 44 } 45 46 template <typename CharT> 47 extern const CharT* js_strchr_limit(const CharT* s, char16_t c, 48 const CharT* limit); 49 50 template <typename CharT> 51 static MOZ_ALWAYS_INLINE size_t js_strnlen(const CharT* s, size_t maxlen) { 52 for (size_t i = 0; i < maxlen; ++i) { 53 if (s[i] == '\0') { 54 return i; 55 } 56 } 57 return maxlen; 58 } 59 60 namespace js { 61 62 class JS_PUBLIC_API GenericPrinter; 63 64 template <typename CharT> 65 constexpr uint8_t AsciiDigitToNumber(CharT c) { 66 using UnsignedCharT = std::make_unsigned_t<CharT>; 67 auto uc = static_cast<UnsignedCharT>(c); 68 return uc - '0'; 69 } 70 71 template <typename CharT> 72 static constexpr bool IsAsciiPrintable(CharT c) { 73 using UnsignedCharT = std::make_unsigned_t<CharT>; 74 auto uc = static_cast<UnsignedCharT>(c); 75 return ' ' <= uc && uc <= '~'; 76 } 77 78 template <typename Char1, typename Char2> 79 inline bool EqualChars(const Char1* s1, const Char2* s2, size_t len) { 80 // Cast |JS::Latin1Char| to |char| to ensure compilers emit std::memcmp for 81 // the comparison. 82 if constexpr (std::is_same_v<Char1, char> && 83 std::is_same_v<Char2, JS::Latin1Char>) { 84 return mozilla::ArrayEqual(s1, reinterpret_cast<const char*>(s2), len); 85 } else if constexpr (std::is_same_v<Char1, JS::Latin1Char> && 86 std::is_same_v<Char2, char>) { 87 return mozilla::ArrayEqual(reinterpret_cast<const char*>(s1), s2, len); 88 } else { 89 return mozilla::ArrayEqual(s1, s2, len); 90 } 91 } 92 93 // Return less than, equal to, or greater than zero depending on whether 94 // s1 is less than, equal to, or greater than s2. 95 template <typename Char1, typename Char2> 96 inline int32_t CompareChars(const Char1* s1, size_t len1, const Char2* s2, 97 size_t len2) { 98 size_t n = std::min(len1, len2); 99 for (size_t i = 0; i < n; i++) { 100 if (int32_t cmp = s1[i] - s2[i]) { 101 return cmp; 102 } 103 } 104 105 return int32_t(len1 - len2); 106 } 107 108 // Return s advanced past any Unicode white space characters. 109 template <typename CharT> 110 static inline const CharT* SkipSpace(const CharT* s, const CharT* end) { 111 MOZ_ASSERT(s <= end); 112 113 while (s < end && unicode::IsSpace(*s)) { 114 s++; 115 } 116 117 return s; 118 } 119 120 extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId, JSContext* cx, 121 const char* s); 122 123 extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId, JSContext* cx, 124 const char* s, size_t n); 125 126 extern UniqueLatin1Chars DuplicateStringToArena(arena_id_t destArenaId, 127 JSContext* cx, 128 const Latin1Char* s, size_t n); 129 130 extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId, 131 JSContext* cx, 132 const char16_t* s); 133 134 extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId, 135 JSContext* cx, 136 const char16_t* s, size_t n); 137 138 /* 139 * These variants do not report OOMs, you must arrange for OOMs to be reported 140 * yourself. 141 */ 142 extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId, 143 const char* s); 144 145 extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId, const char* s, 146 size_t n); 147 148 extern UniqueLatin1Chars DuplicateStringToArena(arena_id_t destArenaId, 149 const JS::Latin1Char* s, 150 size_t n); 151 152 extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId, 153 const char16_t* s); 154 155 extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId, 156 const char16_t* s, size_t n); 157 158 extern UniqueChars DuplicateString(JSContext* cx, const char* s); 159 extern UniqueChars DuplicateString(FrontendContext* fc, const char* s); 160 161 extern UniqueChars DuplicateString(JSContext* cx, const char* s, size_t n); 162 163 extern UniqueLatin1Chars DuplicateString(JSContext* cx, const JS::Latin1Char* s, 164 size_t n); 165 166 extern UniqueTwoByteChars DuplicateString(JSContext* cx, const char16_t* s); 167 extern UniqueTwoByteChars DuplicateString(FrontendContext* fc, 168 const char16_t* s); 169 170 extern UniqueTwoByteChars DuplicateString(JSContext* cx, const char16_t* s, 171 size_t n); 172 173 /* 174 * These variants do not report OOMs, you must arrange for OOMs to be reported 175 * yourself. 176 */ 177 extern UniqueChars DuplicateString(const char* s); 178 179 extern UniqueChars DuplicateString(const char* s, size_t n); 180 181 extern UniqueLatin1Chars DuplicateString(const JS::Latin1Char* s, size_t n); 182 183 extern UniqueTwoByteChars DuplicateString(const char16_t* s); 184 185 extern UniqueTwoByteChars DuplicateString(const char16_t* s, size_t n); 186 187 /* 188 * Inflate bytes in ASCII encoding to char16_t code units. Return null on error, 189 * otherwise return the char16_t buffer that was malloc'ed. A null char is 190 * appended. 191 */ 192 extern char16_t* InflateString(JSContext* cx, const char* bytes, size_t length); 193 194 /** 195 * For a valid UTF-8, Latin-1, or WTF-16 code unit sequence, expose its contents 196 * as the sequence of WTF-16 |char16_t| code units that would identically 197 * constitute it. 198 */ 199 template <typename CharT> 200 class InflatedChar16Sequence { 201 private: 202 const CharT* units_; 203 const CharT* limit_; 204 205 static_assert(std::is_same_v<CharT, char16_t> || 206 std::is_same_v<CharT, JS::Latin1Char>, 207 "InflatedChar16Sequence only supports UTF-8/Latin-1/WTF-16"); 208 209 public: 210 InflatedChar16Sequence(const CharT* units, size_t len) 211 : units_(units), limit_(units_ + len) {} 212 213 bool hasMore() { return units_ < limit_; } 214 215 char16_t next() { 216 MOZ_ASSERT(hasMore()); 217 return static_cast<char16_t>(*units_++); 218 } 219 220 HashNumber computeHash() const { 221 auto copy = *this; 222 HashNumber hash = 0; 223 while (copy.hasMore()) { 224 hash = mozilla::AddToHash(hash, copy.next()); 225 } 226 return hash; 227 } 228 }; 229 230 template <> 231 class InflatedChar16Sequence<mozilla::Utf8Unit> { 232 private: 233 const mozilla::Utf8Unit* units_; 234 const mozilla::Utf8Unit* limit_; 235 236 char16_t pendingTrailingSurrogate_ = 0; 237 238 public: 239 InflatedChar16Sequence(const mozilla::Utf8Unit* units, size_t len) 240 : units_(units), limit_(units + len) {} 241 242 bool hasMore() { return pendingTrailingSurrogate_ || units_ < limit_; } 243 244 char16_t next() { 245 MOZ_ASSERT(hasMore()); 246 247 if (MOZ_UNLIKELY(pendingTrailingSurrogate_)) { 248 char16_t trail = 0; 249 std::swap(pendingTrailingSurrogate_, trail); 250 return trail; 251 } 252 253 mozilla::Utf8Unit unit = *units_++; 254 if (mozilla::IsAscii(unit)) { 255 return static_cast<char16_t>(unit.toUint8()); 256 } 257 258 mozilla::Maybe<char32_t> cp = 259 mozilla::DecodeOneUtf8CodePoint(unit, &units_, limit_); 260 MOZ_ASSERT(cp.isSome(), "input code unit sequence required to be valid"); 261 262 char32_t v = cp.value(); 263 if (v < unicode::NonBMPMin) { 264 return mozilla::AssertedCast<char16_t>(v); 265 } 266 267 char16_t lead; 268 unicode::UTF16Encode(v, &lead, &pendingTrailingSurrogate_); 269 270 MOZ_ASSERT(unicode::IsLeadSurrogate(lead)); 271 272 MOZ_ASSERT(pendingTrailingSurrogate_ != 0, 273 "pendingTrailingSurrogate_ must be nonzero to be detected and " 274 "returned next go-around"); 275 MOZ_ASSERT(unicode::IsTrailSurrogate(pendingTrailingSurrogate_)); 276 277 return lead; 278 } 279 280 HashNumber computeHash() const { 281 auto copy = *this; 282 HashNumber hash = 0; 283 while (copy.hasMore()) { 284 hash = mozilla::AddToHash(hash, copy.next()); 285 } 286 return hash; 287 } 288 }; 289 290 /* 291 * Inflate bytes to JS chars in an existing buffer. 'dst' must be large 292 * enough for 'srclen' char16_t code units. The buffer is NOT null-terminated. 293 */ 294 inline void CopyAndInflateChars(char16_t* dst, const char* src, size_t srclen) { 295 mozilla::ConvertLatin1toUtf16(mozilla::Span(src, srclen), 296 mozilla::Span(dst, srclen)); 297 } 298 299 inline void CopyAndInflateChars(char16_t* dst, const JS::Latin1Char* src, 300 size_t srclen) { 301 mozilla::ConvertLatin1toUtf16(mozilla::AsChars(mozilla::Span(src, srclen)), 302 mozilla::Span(dst, srclen)); 303 } 304 305 /* 306 * Convert one UCS-4 char and write it into a UTF-8 buffer, which must be at 307 * least 4 bytes long. Return the number of UTF-8 bytes of data written. 308 */ 309 extern uint32_t OneUcs4ToUtf8Char(uint8_t* utf8Buffer, char32_t ucs4Char); 310 311 extern size_t PutEscapedStringImpl(char* buffer, size_t size, 312 GenericPrinter* out, 313 const JSLinearString* str, uint32_t quote); 314 315 template <typename CharT> 316 extern size_t PutEscapedStringImpl(char* buffer, size_t bufferSize, 317 GenericPrinter* out, const CharT* chars, 318 size_t length, uint32_t quote); 319 320 /* 321 * Write str into buffer escaping any non-printable or non-ASCII character 322 * using \escapes for JS string literals. 323 * Guarantees that a NUL is at the end of the buffer unless size is 0. Returns 324 * the length of the written output, NOT including the NUL. Thus, a return 325 * value of size or more means that the output was truncated. If buffer 326 * is null, just returns the length of the output. If quote is not 0, it must 327 * be a single or double quote character that will quote the output. 328 */ 329 inline size_t PutEscapedString(char* buffer, size_t size, 330 const JSLinearString* str, uint32_t quote) { 331 size_t n = PutEscapedStringImpl(buffer, size, nullptr, str, quote); 332 333 /* PutEscapedStringImpl can only fail with a file. */ 334 MOZ_ASSERT(n != size_t(-1)); 335 return n; 336 } 337 338 template <typename CharT> 339 inline size_t PutEscapedString(char* buffer, size_t bufferSize, 340 const CharT* chars, size_t length, 341 uint32_t quote) { 342 size_t n = 343 PutEscapedStringImpl(buffer, bufferSize, nullptr, chars, length, quote); 344 345 /* PutEscapedStringImpl can only fail with a file. */ 346 MOZ_ASSERT(n != size_t(-1)); 347 return n; 348 } 349 350 inline bool EscapedStringPrinter(GenericPrinter& out, const JSLinearString* str, 351 uint32_t quote) { 352 return PutEscapedStringImpl(nullptr, 0, &out, str, quote) != size_t(-1); 353 } 354 355 JSString* EncodeURI(JSContext* cx, const char* chars, size_t length); 356 357 // Return true if input string contains a given flag in a comma separated list. 358 bool ContainsFlag(const char* str, const char* flag); 359 360 namespace unicode { 361 362 /** 363 * Compute the number of UTF-16 code units in the valid UTF-8 range 364 * [begin, end). 365 */ 366 extern size_t CountUTF16CodeUnits(const mozilla::Utf8Unit* begin, 367 const mozilla::Utf8Unit* end); 368 369 /** 370 * Count the number of UTF-16 code units in [begin, end). 371 */ 372 inline size_t CountUTF16CodeUnits(const char16_t* begin, const char16_t* end) { 373 MOZ_ASSERT(begin <= end); 374 return end - begin; 375 } 376 377 } // namespace unicode 378 379 } // namespace js 380 381 #endif // util_Text_h