tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Text.h (12490B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #ifndef util_Text_h
      8 #define util_Text_h
      9 
     10 #include "mozilla/ArrayUtils.h"
     11 #include "mozilla/Assertions.h"
     12 #include "mozilla/Attributes.h"
     13 #include "mozilla/Casting.h"
     14 #include "mozilla/Latin1.h"
     15 #include "mozilla/Likely.h"
     16 #include "mozilla/TextUtils.h"
     17 #include "mozilla/Utf8.h"
     18 
     19 #include <algorithm>
     20 #include <stddef.h>
     21 #include <stdint.h>
     22 #include <string>
     23 #include <type_traits>
     24 #include <utility>
     25 
     26 #include "NamespaceImports.h"
     27 
     28 #include "js/Utility.h"
     29 #include "util/Unicode.h"
     30 
     31 namespace js {
     32 class FrontendContext;
     33 }  // namespace js
     34 
     35 class JSLinearString;
     36 
     37 template <typename CharT>
     38 static constexpr MOZ_ALWAYS_INLINE size_t js_strlen(const CharT* s) {
     39  if constexpr (std::is_same_v<CharT, JS::Latin1Char>) {
     40    return std::char_traits<char>::length(reinterpret_cast<const char*>(s));
     41  } else {
     42    return std::char_traits<CharT>::length(s);
     43  }
     44 }
     45 
// Declaration only; no definition appears in this header.
// NOTE(review): judging by the name and parameters, this presumably searches
// the range [s, limit) for the code unit |c| and returns a pointer to the
// first match (what it returns when |c| is absent is not visible here) --
// confirm against the definition before relying on it.
template <typename CharT>
extern const CharT* js_strchr_limit(const CharT* s, char16_t c,
                                   const CharT* limit);
     49 
     50 template <typename CharT>
     51 static MOZ_ALWAYS_INLINE size_t js_strnlen(const CharT* s, size_t maxlen) {
     52  for (size_t i = 0; i < maxlen; ++i) {
     53    if (s[i] == '\0') {
     54      return i;
     55    }
     56  }
     57  return maxlen;
     58 }
     59 
     60 namespace js {
     61 
     62 class JS_PUBLIC_API GenericPrinter;
     63 
     64 template <typename CharT>
     65 constexpr uint8_t AsciiDigitToNumber(CharT c) {
     66  using UnsignedCharT = std::make_unsigned_t<CharT>;
     67  auto uc = static_cast<UnsignedCharT>(c);
     68  return uc - '0';
     69 }
     70 
     71 template <typename CharT>
     72 static constexpr bool IsAsciiPrintable(CharT c) {
     73  using UnsignedCharT = std::make_unsigned_t<CharT>;
     74  auto uc = static_cast<UnsignedCharT>(c);
     75  return ' ' <= uc && uc <= '~';
     76 }
     77 
     78 template <typename Char1, typename Char2>
     79 inline bool EqualChars(const Char1* s1, const Char2* s2, size_t len) {
     80  // Cast |JS::Latin1Char| to |char| to ensure compilers emit std::memcmp for
     81  // the comparison.
     82  if constexpr (std::is_same_v<Char1, char> &&
     83                std::is_same_v<Char2, JS::Latin1Char>) {
     84    return mozilla::ArrayEqual(s1, reinterpret_cast<const char*>(s2), len);
     85  } else if constexpr (std::is_same_v<Char1, JS::Latin1Char> &&
     86                       std::is_same_v<Char2, char>) {
     87    return mozilla::ArrayEqual(reinterpret_cast<const char*>(s1), s2, len);
     88  } else {
     89    return mozilla::ArrayEqual(s1, s2, len);
     90  }
     91 }
     92 
     93 // Return less than, equal to, or greater than zero depending on whether
     94 // s1 is less than, equal to, or greater than s2.
     95 template <typename Char1, typename Char2>
     96 inline int32_t CompareChars(const Char1* s1, size_t len1, const Char2* s2,
     97                            size_t len2) {
     98  size_t n = std::min(len1, len2);
     99  for (size_t i = 0; i < n; i++) {
    100    if (int32_t cmp = s1[i] - s2[i]) {
    101      return cmp;
    102    }
    103  }
    104 
    105  return int32_t(len1 - len2);
    106 }
    107 
    108 // Return s advanced past any Unicode white space characters.
    109 template <typename CharT>
    110 static inline const CharT* SkipSpace(const CharT* s, const CharT* end) {
    111  MOZ_ASSERT(s <= end);
    112 
    113  while (s < end && unicode::IsSpace(*s)) {
    114    s++;
    115  }
    116 
    117  return s;
    118 }
    119 
// Overloads of DuplicateStringToArena that take a JSContext*. Each accepts the
// destination arena id, the context, and the source characters (char,
// Latin1Char or char16_t), optionally with an explicit count |n|, and returns
// the Unique*Chars type matching the character type.
// NOTE(review): the context-free overloads of this function are documented as
// not reporting OOMs, so these presumably report allocation failure on |cx| --
// confirm in the definitions. The overloads without |n| presumably expect a
// null-terminated |s|.
extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId, JSContext* cx,
                                         const char* s);

extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId, JSContext* cx,
                                         const char* s, size_t n);

extern UniqueLatin1Chars DuplicateStringToArena(arena_id_t destArenaId,
                                               JSContext* cx,
                                               const Latin1Char* s, size_t n);

extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId,
                                                JSContext* cx,
                                                const char16_t* s);

extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId,
                                                JSContext* cx,
                                                const char16_t* s, size_t n);
    137 
/*
 * These variants do not report OOMs; you must arrange for OOMs to be reported
 * yourself.
 *
 * They take the same character arguments as the JSContext-taking
 * DuplicateStringToArena overloads, minus the context.
 * NOTE(review): presumably a null result signals allocation failure, and the
 * overloads without |n| expect a null-terminated |s| -- confirm in the
 * definitions.
 */
extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId,
                                         const char* s);

extern UniqueChars DuplicateStringToArena(arena_id_t destArenaId, const char* s,
                                         size_t n);

extern UniqueLatin1Chars DuplicateStringToArena(arena_id_t destArenaId,
                                               const JS::Latin1Char* s,
                                               size_t n);

extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId,
                                                const char16_t* s);

extern UniqueTwoByteChars DuplicateStringToArena(arena_id_t destArenaId,
                                                const char16_t* s, size_t n);
    157 
// Overloads of DuplicateString that take a context: a JSContext* for every
// character type / length combination, and a FrontendContext* for the
// |const char*| and |const char16_t*| forms without a length. Each returns the
// Unique*Chars type matching the character type.
// NOTE(review): the context-free DuplicateString overloads are documented as
// not reporting OOMs, so these presumably report allocation failure on the
// given context -- confirm in the definitions. The overloads without |n|
// presumably expect a null-terminated |s|.
extern UniqueChars DuplicateString(JSContext* cx, const char* s);
extern UniqueChars DuplicateString(FrontendContext* fc, const char* s);

extern UniqueChars DuplicateString(JSContext* cx, const char* s, size_t n);

extern UniqueLatin1Chars DuplicateString(JSContext* cx, const JS::Latin1Char* s,
                                        size_t n);

extern UniqueTwoByteChars DuplicateString(JSContext* cx, const char16_t* s);
extern UniqueTwoByteChars DuplicateString(FrontendContext* fc,
                                         const char16_t* s);

extern UniqueTwoByteChars DuplicateString(JSContext* cx, const char16_t* s,
                                         size_t n);
    172 
/*
 * These variants do not report OOMs; you must arrange for OOMs to be reported
 * yourself.
 *
 * They take only the source characters (and, for some overloads, an explicit
 * count |n|); no context is involved.
 * NOTE(review): presumably a null result signals allocation failure, and the
 * overloads without |n| expect a null-terminated |s| -- confirm in the
 * definitions.
 */
extern UniqueChars DuplicateString(const char* s);

extern UniqueChars DuplicateString(const char* s, size_t n);

extern UniqueLatin1Chars DuplicateString(const JS::Latin1Char* s, size_t n);

extern UniqueTwoByteChars DuplicateString(const char16_t* s);

extern UniqueTwoByteChars DuplicateString(const char16_t* s, size_t n);
    186 
/*
 * Inflate bytes in ASCII encoding to char16_t code units. Return null on error,
 * otherwise return the char16_t buffer that was malloc'ed. A null char is
 * appended.
 *
 * bytes/length: the source bytes and how many of them to inflate.
 * NOTE(review): the result is a raw pointer to a malloc'ed buffer, so the
 * caller presumably owns it and must free it -- confirm the expected
 * deallocation function in the definition.
 */
extern char16_t* InflateString(JSContext* cx, const char* bytes, size_t length);
    193 
    194 /**
    195 * For a valid UTF-8, Latin-1, or WTF-16 code unit sequence, expose its contents
    196 * as the sequence of WTF-16 |char16_t| code units that would identically
    197 * constitute it.
    198 */
    199 template <typename CharT>
    200 class InflatedChar16Sequence {
    201 private:
    202  const CharT* units_;
    203  const CharT* limit_;
    204 
    205  static_assert(std::is_same_v<CharT, char16_t> ||
    206                    std::is_same_v<CharT, JS::Latin1Char>,
    207                "InflatedChar16Sequence only supports UTF-8/Latin-1/WTF-16");
    208 
    209 public:
    210  InflatedChar16Sequence(const CharT* units, size_t len)
    211      : units_(units), limit_(units_ + len) {}
    212 
    213  bool hasMore() { return units_ < limit_; }
    214 
    215  char16_t next() {
    216    MOZ_ASSERT(hasMore());
    217    return static_cast<char16_t>(*units_++);
    218  }
    219 
    220  HashNumber computeHash() const {
    221    auto copy = *this;
    222    HashNumber hash = 0;
    223    while (copy.hasMore()) {
    224      hash = mozilla::AddToHash(hash, copy.next());
    225    }
    226    return hash;
    227  }
    228 };
    229 
    230 template <>
    231 class InflatedChar16Sequence<mozilla::Utf8Unit> {
    232 private:
    233  const mozilla::Utf8Unit* units_;
    234  const mozilla::Utf8Unit* limit_;
    235 
    236  char16_t pendingTrailingSurrogate_ = 0;
    237 
    238 public:
    239  InflatedChar16Sequence(const mozilla::Utf8Unit* units, size_t len)
    240      : units_(units), limit_(units + len) {}
    241 
    242  bool hasMore() { return pendingTrailingSurrogate_ || units_ < limit_; }
    243 
    244  char16_t next() {
    245    MOZ_ASSERT(hasMore());
    246 
    247    if (MOZ_UNLIKELY(pendingTrailingSurrogate_)) {
    248      char16_t trail = 0;
    249      std::swap(pendingTrailingSurrogate_, trail);
    250      return trail;
    251    }
    252 
    253    mozilla::Utf8Unit unit = *units_++;
    254    if (mozilla::IsAscii(unit)) {
    255      return static_cast<char16_t>(unit.toUint8());
    256    }
    257 
    258    mozilla::Maybe<char32_t> cp =
    259        mozilla::DecodeOneUtf8CodePoint(unit, &units_, limit_);
    260    MOZ_ASSERT(cp.isSome(), "input code unit sequence required to be valid");
    261 
    262    char32_t v = cp.value();
    263    if (v < unicode::NonBMPMin) {
    264      return mozilla::AssertedCast<char16_t>(v);
    265    }
    266 
    267    char16_t lead;
    268    unicode::UTF16Encode(v, &lead, &pendingTrailingSurrogate_);
    269 
    270    MOZ_ASSERT(unicode::IsLeadSurrogate(lead));
    271 
    272    MOZ_ASSERT(pendingTrailingSurrogate_ != 0,
    273               "pendingTrailingSurrogate_ must be nonzero to be detected and "
    274               "returned next go-around");
    275    MOZ_ASSERT(unicode::IsTrailSurrogate(pendingTrailingSurrogate_));
    276 
    277    return lead;
    278  }
    279 
    280  HashNumber computeHash() const {
    281    auto copy = *this;
    282    HashNumber hash = 0;
    283    while (copy.hasMore()) {
    284      hash = mozilla::AddToHash(hash, copy.next());
    285    }
    286    return hash;
    287  }
    288 };
    289 
    290 /*
    291 * Inflate bytes to JS chars in an existing buffer. 'dst' must be large
    292 * enough for 'srclen' char16_t code units. The buffer is NOT null-terminated.
    293 */
    294 inline void CopyAndInflateChars(char16_t* dst, const char* src, size_t srclen) {
    295  mozilla::ConvertLatin1toUtf16(mozilla::Span(src, srclen),
    296                                mozilla::Span(dst, srclen));
    297 }
    298 
    299 inline void CopyAndInflateChars(char16_t* dst, const JS::Latin1Char* src,
    300                                size_t srclen) {
    301  mozilla::ConvertLatin1toUtf16(mozilla::AsChars(mozilla::Span(src, srclen)),
    302                                mozilla::Span(dst, srclen));
    303 }
    304 
/*
 * Convert one UCS-4 char and write it into a UTF-8 buffer, which must be at
 * least 4 bytes long.  Return the number of UTF-8 bytes of data written.
 *
 * utf8Buffer: caller-provided destination buffer (at least 4 bytes).
 * ucs4Char:   the character to encode.
 *
 * Declaration only; no definition appears in this header.
 */
extern uint32_t OneUcs4ToUtf8Char(uint8_t* utf8Buffer, char32_t ucs4Char);
    310 
// Shared implementation behind PutEscapedString and EscapedStringPrinter;
// declared here, no definition appears in this header.
//
// The inline wrappers in this header call it in two ways: with a character
// buffer and a null |out| (PutEscapedString), or with a null buffer of size 0
// and a GenericPrinter (EscapedStringPrinter). They treat a return value of
// size_t(-1) as failure.
// NOTE(review): the exact escaping rules and the meaning of |quote| are
// defined by the implementation -- see the PutEscapedString comment for the
// documented contract and confirm against the definition.
extern size_t PutEscapedStringImpl(char* buffer, size_t size,
                                  GenericPrinter* out,
                                  const JSLinearString* str, uint32_t quote);

// Same as the overload taking a JSLinearString, but operating on |length|
// code units starting at |chars|.
template <typename CharT>
extern size_t PutEscapedStringImpl(char* buffer, size_t bufferSize,
                                  GenericPrinter* out, const CharT* chars,
                                  size_t length, uint32_t quote);
    319 
    320 /*
    321 * Write str into buffer escaping any non-printable or non-ASCII character
    322 * using \escapes for JS string literals.
    323 * Guarantees that a NUL is at the end of the buffer unless size is 0. Returns
    324 * the length of the written output, NOT including the NUL. Thus, a return
    325 * value of size or more means that the output was truncated. If buffer
    326 * is null, just returns the length of the output. If quote is not 0, it must
    327 * be a single or double quote character that will quote the output.
    328 */
    329 inline size_t PutEscapedString(char* buffer, size_t size,
    330                               const JSLinearString* str, uint32_t quote) {
    331  size_t n = PutEscapedStringImpl(buffer, size, nullptr, str, quote);
    332 
    333  /* PutEscapedStringImpl can only fail with a file. */
    334  MOZ_ASSERT(n != size_t(-1));
    335  return n;
    336 }
    337 
    338 template <typename CharT>
    339 inline size_t PutEscapedString(char* buffer, size_t bufferSize,
    340                               const CharT* chars, size_t length,
    341                               uint32_t quote) {
    342  size_t n =
    343      PutEscapedStringImpl(buffer, bufferSize, nullptr, chars, length, quote);
    344 
    345  /* PutEscapedStringImpl can only fail with a file. */
    346  MOZ_ASSERT(n != size_t(-1));
    347  return n;
    348 }
    349 
    350 inline bool EscapedStringPrinter(GenericPrinter& out, const JSLinearString* str,
    351                                 uint32_t quote) {
    352  return PutEscapedStringImpl(nullptr, 0, &out, str, quote) != size_t(-1);
    353 }
    354 
// Declaration only; no definition appears in this header.
// NOTE(review): presumably produces a new JSString holding the URI-encoded
// form of the |length| chars at |chars|, with a null return on failure --
// confirm against the definition.
JSString* EncodeURI(JSContext* cx, const char* chars, size_t length);

// Return true if input string contains a given flag in a comma separated list.
// |str| is the list and |flag| the entry to look for.
// NOTE(review): matching details (whitespace, case sensitivity, partial
// matches) are not visible here -- confirm in the definition.
bool ContainsFlag(const char* str, const char* flag);
    359 
    360 namespace unicode {
    361 
/**
 * Compute the number of UTF-16 code units in the valid UTF-8 range
 * [begin, end).
 *
 * Declaration only; no definition appears in this header.
 * NOTE(review): presumably this is the number of UTF-16 code units needed to
 * represent the same text -- confirm in the definition.
 */
extern size_t CountUTF16CodeUnits(const mozilla::Utf8Unit* begin,
                                 const mozilla::Utf8Unit* end);
    368 
    369 /**
    370 * Count the number of UTF-16 code units in [begin, end).
    371 */
    372 inline size_t CountUTF16CodeUnits(const char16_t* begin, const char16_t* end) {
    373  MOZ_ASSERT(begin <= end);
    374  return end - begin;
    375 }
    376 
    377 }  // namespace unicode
    378 
    379 }  // namespace js
    380 
    381 #endif  // util_Text_h