tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

TextUtils.h (8938B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 /* Character/text operations. */
      8 
      9 #ifndef mozilla_TextUtils_h
     10 #define mozilla_TextUtils_h
     11 
     12 #include "mozilla/Assertions.h"
     13 #include "mozilla/Latin1.h"
     14 
     15 #ifdef MOZ_HAS_JSRUST
     16 // Can't include mozilla/Encoding.h here.
     17 extern "C" {
     18 // Declared as uint8_t instead of char to match declaration in another header.
     19 size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
     20 }
     21 #endif
     22 
     23 namespace mozilla {
     24 
     25 // See Utf8.h for IsUtf8() and conversions between UTF-8 and UTF-16.
     26 // See Latin1.h for testing UTF-16 and UTF-8 for Latin1ness and
     27 // for conversions to and from Latin1.
     28 
     29 // The overloads below are not templated in order to make
     30 // implicit conversions to span work as expected for the Span
     31 // overloads.
     32 
     /**
      * Reports whether |aChar| is an ASCII code unit, meaning its value falls in
      * the half-open range [0, 0x80).
      */
     inline constexpr bool IsAscii(unsigned char aChar) {
       return aChar <= 0x7F;
     }
     35 
     /**
      * Reports whether |aChar| is an ASCII code unit, meaning its value falls in
      * the half-open range [0, 0x80).
      *
      * The value is reinterpreted as unsigned first, so negative inputs (which
      * convert to values of 0x80 and above) are rejected.
      */
     inline constexpr bool IsAscii(signed char aChar) {
       const auto asUnsigned = static_cast<unsigned char>(aChar);
       return asUnsigned <= 0x7F;
     }
     40 
     /**
      * Reports whether |aChar| is an ASCII code unit, meaning its value falls in
      * the half-open range [0, 0x80).
      *
      * The value is reinterpreted as unsigned first so that the result does not
      * depend on whether plain |char| is signed on the target platform.
      */
     inline constexpr bool IsAscii(char aChar) {
       const auto asUnsigned = static_cast<unsigned char>(aChar);
       return asUnsigned <= 0x7F;
     }
     45 
     #ifdef __cpp_char8_t
     /**
      * Reports whether |aChar| is an ASCII code unit, meaning its value falls in
      * the half-open range [0, 0x80).
      *
      * Only available when the compiler provides the |char8_t| type.
      */
     inline constexpr bool IsAscii(char8_t aChar) {
       const auto asUnsigned = static_cast<unsigned char>(aChar);
       return asUnsigned <= 0x7F;
     }
     #endif
     52 
     /**
      * Reports whether the UTF-16 code unit |aChar| is ASCII, meaning its value
      * falls in the half-open range [0, 0x80).
      */
     inline constexpr bool IsAscii(char16_t aChar) {
       return aChar <= 0x7F;
     }
     55 
     /**
      * Reports whether the 32-bit character |aChar| is ASCII, meaning its value
      * falls in the half-open range [0, 0x80).
      */
     inline constexpr bool IsAscii(char32_t aChar) {
       return aChar <= 0x7F;
     }
     58 
     59 /**
     60 * Returns |true| iff |aString| contains only ASCII characters, that is,
     61 * characters in the range [0x00, 0x80).
     62 *
     63 * @param aString a 8-bit wide string to scan
     64 */
     65 inline bool IsAscii(mozilla::Span<const char> aString) {
     66 #if MOZ_HAS_JSRUST()
     67  size_t length = aString.Length();
     68  const char* ptr = aString.Elements();
     69  // For short strings, avoid the function call, since, the SIMD
     70  // code won't have a chance to kick in anyway.
     71  if (length < mozilla::detail::kShortStringLimitForInlinePaths) {
     72    const uint8_t* uptr = reinterpret_cast<const uint8_t*>(ptr);
     73    uint8_t accu = 0;
     74    for (size_t i = 0; i < length; i++) {
     75      accu |= uptr[i];
     76    }
     77    return accu < 0x80;
     78  }
     79  return encoding_mem_is_ascii(ptr, length);
     80 #else
     81  for (char c : aString) {
     82    if (!IsAscii(c)) {
     83      return false;
     84    }
     85  }
     86  return true;
     87 #endif
     88 }
     89 
     90 /**
     91 * Returns |true| iff |aString| contains only ASCII characters, that is,
     92 * characters in the range [0x00, 0x80).
     93 *
     94 * @param aString a 16-bit wide string to scan
     95 */
     96 inline bool IsAscii(mozilla::Span<const char16_t> aString) {
     97 #if MOZ_HAS_JSRUST()
     98  size_t length = aString.Length();
     99  const char16_t* ptr = aString.Elements();
    100  // For short strings, calling into Rust is a pessimization, and the SIMD
    101  // code won't have a chance to kick in anyway.
    102  // 16 is a bit larger than logically necessary for this function alone,
    103  // but it's important that the limit here matches the limit used in
    104  // LossyConvertUtf16toLatin1!
    105  if (length < mozilla::detail::kShortStringLimitForInlinePaths) {
    106    char16_t accu = 0;
    107    for (size_t i = 0; i < length; i++) {
    108      accu |= ptr[i];
    109    }
    110    return accu < 0x80;
    111  }
    112  return encoding_mem_is_basic_latin(ptr, length);
    113 #else
    114  for (char16_t c : aString) {
    115    if (!IsAscii(c)) {
    116      return false;
    117    }
    118  }
    119  return true;
    120 #endif
    121 }
    122 
    /**
     * Reports whether every character of the null-terminated string that
     * |aChar| points to is ASCII, i.e. in the range [0, 0x80). The terminator
     * itself ends the scan; an empty string yields |true|.
     */
    template <typename Char>
    constexpr bool IsAsciiNullTerminated(const Char* aChar) {
      // Walk forward until the terminating zero, bailing out on the first
      // character that is not ASCII.
      for (const Char* cursor = aChar; *cursor; ++cursor) {
        const Char current = *cursor;
        if (!IsAscii(current)) {
          return false;
        }
      }
      return true;
    }
    136 
    137 #if MOZ_HAS_JSRUST()
    138 /**
    139 * Returns the index of the first non-ASCII byte or
    140 * the length of the string if there are none.
    141 */
    142 inline size_t AsciiValidUpTo(mozilla::Span<const char> aString) {
    143  return encoding_ascii_valid_up_to(
    144      reinterpret_cast<const uint8_t*>(aString.Elements()), aString.Length());
    145 }
    146 
    147 /**
    148 * Returns the index of the first unpaired surrogate or
    149 * the length of the string if there are none.
    150 */
    151 inline size_t Utf16ValidUpTo(mozilla::Span<const char16_t> aString) {
    152  return encoding_mem_utf16_valid_up_to(aString.Elements(), aString.Length());
    153 }
    154 
    155 /**
    156 * Replaces unpaired surrogates with U+FFFD in the argument.
    157 *
    158 * Note: If you have an nsAString, use EnsureUTF16Validity() from
    159 * nsReadableUtils.h instead to avoid unsharing a valid shared
    160 * string.
    161 */
    162 inline void EnsureUtf16ValiditySpan(mozilla::Span<char16_t> aString) {
    163  encoding_mem_ensure_utf16_validity(aString.Elements(), aString.Length());
    164 }
    165 
    166 /**
    167 * Convert ASCII to UTF-16. In debug builds, assert that the input is
    168 * ASCII.
    169 *
    170 * The length of aDest must not be less than the length of aSource.
    171 */
    172 inline void ConvertAsciitoUtf16(mozilla::Span<const char> aSource,
    173                                mozilla::Span<char16_t> aDest) {
    174  MOZ_ASSERT(IsAscii(aSource));
    175  ConvertLatin1toUtf16(aSource, aDest);
    176 }
    177 
    178 #endif  // MOZ_HAS_JSRUST
    179 
    180 /**
    181 * Returns true iff |aChar| matches Ascii Whitespace.
    182 *
    183 * This function is intended to match the Infra standard
    184 * (https://infra.spec.whatwg.org/#ascii-whitespace)
    185 */
    186 template <typename Char>
    187 constexpr bool IsAsciiWhitespace(Char aChar) {
    188  using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
    189  auto uc = static_cast<UnsignedChar>(aChar);
    190  return uc == 0x9 || uc == 0xA || uc == 0xC || uc == 0xD || uc == 0x20;
    191 }
    192 
    193 /**
    194 * Returns true iff |aChar| matches [a-z].
    195 *
    196 * This function is basically what you thought islower was, except its behavior
    197 * doesn't depend on the user's current locale.
    198 */
    199 template <typename Char>
    200 constexpr bool IsAsciiLowercaseAlpha(Char aChar) {
    201  using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
    202  auto uc = static_cast<UnsignedChar>(aChar);
    203  return 'a' <= uc && uc <= 'z';
    204 }
    205 
    206 /**
    207 * Returns true iff |aChar| matches [A-Z].
    208 *
    209 * This function is basically what you thought isupper was, except its behavior
    210 * doesn't depend on the user's current locale.
    211 */
    212 template <typename Char>
    213 constexpr bool IsAsciiUppercaseAlpha(Char aChar) {
    214  using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
    215  auto uc = static_cast<UnsignedChar>(aChar);
    216  return 'A' <= uc && uc <= 'Z';
    217 }
    218 
    /**
     * Reports whether |aChar| matches [a-zA-Z].
     *
     * A locale-independent stand-in for isalpha: the result never depends on
     * the user's current locale.
     */
    template <typename Char>
    constexpr bool IsAsciiAlpha(Char aChar) {
      if (IsAsciiLowercaseAlpha(aChar)) {
        return true;
      }
      return IsAsciiUppercaseAlpha(aChar);
    }
    229 
    230 /**
    231 * Returns true iff |aChar| matches [0-9].
    232 *
    233 * This function is basically what you thought isdigit was, except its behavior
    234 * doesn't depend on the user's current locale.
    235 */
    236 template <typename Char>
    237 constexpr bool IsAsciiDigit(Char aChar) {
    238  using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
    239  auto uc = static_cast<UnsignedChar>(aChar);
    240  return '0' <= uc && uc <= '9';
    241 }
    242 
    243 /**
    244 * Returns true iff |aChar| matches [0-9a-fA-F].
    245 *
    246 * This function is basically isxdigit, but guaranteed to be only for ASCII.
    247 */
    248 template <typename Char>
    249 constexpr bool IsAsciiHexDigit(Char aChar) {
    250  using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
    251  auto uc = static_cast<UnsignedChar>(aChar);
    252  return ('0' <= uc && uc <= '9') || ('a' <= uc && uc <= 'f') ||
    253         ('A' <= uc && uc <= 'F');
    254 }
    255 
    /**
     * Reports whether |aChar| matches [a-zA-Z0-9].
     *
     * A locale-independent stand-in for isalnum: the result never depends on
     * the user's current locale.
     */
    template <typename Char>
    constexpr bool IsAsciiAlphanumeric(Char aChar) {
      if (IsAsciiDigit(aChar)) {
        return true;
      }
      return IsAsciiAlpha(aChar);
    }
    266 
    267 /**
    268 * Converts an ASCII alphanumeric digit [0-9a-zA-Z] to number as if in base-36.
    269 * (This function therefore works for decimal, hexadecimal, etc.).
    270 */
    271 template <typename Char>
    272 constexpr uint8_t AsciiAlphanumericToNumber(Char aChar) {
    273  using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
    274  auto uc = static_cast<UnsignedChar>(aChar);
    275 
    276  if ('0' <= uc && uc <= '9') {
    277    return uc - '0';
    278  }
    279 
    280  if ('A' <= uc && uc <= 'Z') {
    281    return uc - 'A' + 10;
    282  }
    283 
    284  MOZ_ASSERT(IsAsciiLowercaseAlpha(aChar),
    285             "non-ASCII alphanumeric character can't be converted to number");
    286  return uc - 'a' + 10;
    287 }
    288 
    289 }  // namespace mozilla
    290 
    291 #endif /* mozilla_TextUtils_h */