TextUtils.h (8938B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 /* Character/text operations. */ 8 9 #ifndef mozilla_TextUtils_h 10 #define mozilla_TextUtils_h 11 12 #include "mozilla/Assertions.h" 13 #include "mozilla/Latin1.h" 14 15 #ifdef MOZ_HAS_JSRUST 16 // Can't include mozilla/Encoding.h here. 17 extern "C" { 18 // Declared as uint8_t instead of char to match declaration in another header. 19 size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len); 20 } 21 #endif 22 23 namespace mozilla { 24 25 // See Utf8.h for IsUtf8() and conversions between UTF-8 and UTF-16. 26 // See Latin1.h for testing UTF-16 and UTF-8 for Latin1ness and 27 // for conversions to and from Latin1. 28 29 // The overloads below are not templated in order to make 30 // implicit conversions to span work as expected for the Span 31 // overloads. 32 33 /** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */ 34 inline constexpr bool IsAscii(unsigned char aChar) { return aChar < 0x80; } 35 36 /** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */ 37 inline constexpr bool IsAscii(signed char aChar) { 38 return IsAscii(static_cast<unsigned char>(aChar)); 39 } 40 41 /** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */ 42 inline constexpr bool IsAscii(char aChar) { 43 return IsAscii(static_cast<unsigned char>(aChar)); 44 } 45 46 #ifdef __cpp_char8_t 47 /** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */ 48 inline constexpr bool IsAscii(char8_t aChar) { 49 return IsAscii(static_cast<unsigned char>(aChar)); 50 } 51 #endif 52 53 /** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */ 54 inline constexpr bool IsAscii(char16_t aChar) { return aChar < 0x80; } 55 56 /** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */ 57 inline constexpr bool IsAscii(char32_t aChar) { return aChar < 0x80; } 58 59 /** 60 * Returns |true| iff |aString| contains only ASCII characters, that is, 61 * characters in the range [0x00, 0x80). 62 * 63 * @param aString a 8-bit wide string to scan 64 */ 65 inline bool IsAscii(mozilla::Span<const char> aString) { 66 #if MOZ_HAS_JSRUST() 67 size_t length = aString.Length(); 68 const char* ptr = aString.Elements(); 69 // For short strings, avoid the function call, since, the SIMD 70 // code won't have a chance to kick in anyway. 71 if (length < mozilla::detail::kShortStringLimitForInlinePaths) { 72 const uint8_t* uptr = reinterpret_cast<const uint8_t*>(ptr); 73 uint8_t accu = 0; 74 for (size_t i = 0; i < length; i++) { 75 accu |= uptr[i]; 76 } 77 return accu < 0x80; 78 } 79 return encoding_mem_is_ascii(ptr, length); 80 #else 81 for (char c : aString) { 82 if (!IsAscii(c)) { 83 return false; 84 } 85 } 86 return true; 87 #endif 88 } 89 90 /** 91 * Returns |true| iff |aString| contains only ASCII characters, that is, 92 * characters in the range [0x00, 0x80). 93 * 94 * @param aString a 16-bit wide string to scan 95 */ 96 inline bool IsAscii(mozilla::Span<const char16_t> aString) { 97 #if MOZ_HAS_JSRUST() 98 size_t length = aString.Length(); 99 const char16_t* ptr = aString.Elements(); 100 // For short strings, calling into Rust is a pessimization, and the SIMD 101 // code won't have a chance to kick in anyway. 102 // 16 is a bit larger than logically necessary for this function alone, 103 // but it's important that the limit here matches the limit used in 104 // LossyConvertUtf16toLatin1! 105 if (length < mozilla::detail::kShortStringLimitForInlinePaths) { 106 char16_t accu = 0; 107 for (size_t i = 0; i < length; i++) { 108 accu |= ptr[i]; 109 } 110 return accu < 0x80; 111 } 112 return encoding_mem_is_basic_latin(ptr, length); 113 #else 114 for (char16_t c : aString) { 115 if (!IsAscii(c)) { 116 return false; 117 } 118 } 119 return true; 120 #endif 121 } 122 123 /** 124 * Returns true iff every character in the null-terminated string pointed to by 125 * |aChar| is ASCII, i.e. in the range [0, 0x80). 126 */ 127 template <typename Char> 128 constexpr bool IsAsciiNullTerminated(const Char* aChar) { 129 while (Char c = *aChar++) { 130 if (!IsAscii(c)) { 131 return false; 132 } 133 } 134 return true; 135 } 136 137 #if MOZ_HAS_JSRUST() 138 /** 139 * Returns the index of the first non-ASCII byte or 140 * the length of the string if there are none. 141 */ 142 inline size_t AsciiValidUpTo(mozilla::Span<const char> aString) { 143 return encoding_ascii_valid_up_to( 144 reinterpret_cast<const uint8_t*>(aString.Elements()), aString.Length()); 145 } 146 147 /** 148 * Returns the index of the first unpaired surrogate or 149 * the length of the string if there are none. 150 */ 151 inline size_t Utf16ValidUpTo(mozilla::Span<const char16_t> aString) { 152 return encoding_mem_utf16_valid_up_to(aString.Elements(), aString.Length()); 153 } 154 155 /** 156 * Replaces unpaired surrogates with U+FFFD in the argument. 157 * 158 * Note: If you have an nsAString, use EnsureUTF16Validity() from 159 * nsReadableUtils.h instead to avoid unsharing a valid shared 160 * string. 161 */ 162 inline void EnsureUtf16ValiditySpan(mozilla::Span<char16_t> aString) { 163 encoding_mem_ensure_utf16_validity(aString.Elements(), aString.Length()); 164 } 165 166 /** 167 * Convert ASCII to UTF-16. In debug builds, assert that the input is 168 * ASCII. 169 * 170 * The length of aDest must not be less than the length of aSource. 171 */ 172 inline void ConvertAsciitoUtf16(mozilla::Span<const char> aSource, 173 mozilla::Span<char16_t> aDest) { 174 MOZ_ASSERT(IsAscii(aSource)); 175 ConvertLatin1toUtf16(aSource, aDest); 176 } 177 178 #endif // MOZ_HAS_JSRUST 179 180 /** 181 * Returns true iff |aChar| matches Ascii Whitespace. 182 * 183 * This function is intended to match the Infra standard 184 * (https://infra.spec.whatwg.org/#ascii-whitespace) 185 */ 186 template <typename Char> 187 constexpr bool IsAsciiWhitespace(Char aChar) { 188 using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type; 189 auto uc = static_cast<UnsignedChar>(aChar); 190 return uc == 0x9 || uc == 0xA || uc == 0xC || uc == 0xD || uc == 0x20; 191 } 192 193 /** 194 * Returns true iff |aChar| matches [a-z]. 195 * 196 * This function is basically what you thought islower was, except its behavior 197 * doesn't depend on the user's current locale. 198 */ 199 template <typename Char> 200 constexpr bool IsAsciiLowercaseAlpha(Char aChar) { 201 using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type; 202 auto uc = static_cast<UnsignedChar>(aChar); 203 return 'a' <= uc && uc <= 'z'; 204 } 205 206 /** 207 * Returns true iff |aChar| matches [A-Z]. 208 * 209 * This function is basically what you thought isupper was, except its behavior 210 * doesn't depend on the user's current locale. 211 */ 212 template <typename Char> 213 constexpr bool IsAsciiUppercaseAlpha(Char aChar) { 214 using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type; 215 auto uc = static_cast<UnsignedChar>(aChar); 216 return 'A' <= uc && uc <= 'Z'; 217 } 218 219 /** 220 * Returns true iff |aChar| matches [a-zA-Z]. 221 * 222 * This function is basically what you thought isalpha was, except its behavior 223 * doesn't depend on the user's current locale. 224 */ 225 template <typename Char> 226 constexpr bool IsAsciiAlpha(Char aChar) { 227 return IsAsciiLowercaseAlpha(aChar) || IsAsciiUppercaseAlpha(aChar); 228 } 229 230 /** 231 * Returns true iff |aChar| matches [0-9]. 232 * 233 * This function is basically what you thought isdigit was, except its behavior 234 * doesn't depend on the user's current locale. 235 */ 236 template <typename Char> 237 constexpr bool IsAsciiDigit(Char aChar) { 238 using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type; 239 auto uc = static_cast<UnsignedChar>(aChar); 240 return '0' <= uc && uc <= '9'; 241 } 242 243 /** 244 * Returns true iff |aChar| matches [0-9a-fA-F]. 245 * 246 * This function is basically isxdigit, but guaranteed to be only for ASCII. 247 */ 248 template <typename Char> 249 constexpr bool IsAsciiHexDigit(Char aChar) { 250 using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type; 251 auto uc = static_cast<UnsignedChar>(aChar); 252 return ('0' <= uc && uc <= '9') || ('a' <= uc && uc <= 'f') || 253 ('A' <= uc && uc <= 'F'); 254 } 255 256 /** 257 * Returns true iff |aChar| matches [a-zA-Z0-9]. 258 * 259 * This function is basically what you thought isalnum was, except its behavior 260 * doesn't depend on the user's current locale. 261 */ 262 template <typename Char> 263 constexpr bool IsAsciiAlphanumeric(Char aChar) { 264 return IsAsciiDigit(aChar) || IsAsciiAlpha(aChar); 265 } 266 267 /** 268 * Converts an ASCII alphanumeric digit [0-9a-zA-Z] to number as if in base-36. 269 * (This function therefore works for decimal, hexadecimal, etc.). 270 */ 271 template <typename Char> 272 constexpr uint8_t AsciiAlphanumericToNumber(Char aChar) { 273 using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type; 274 auto uc = static_cast<UnsignedChar>(aChar); 275 276 if ('0' <= uc && uc <= '9') { 277 return uc - '0'; 278 } 279 280 if ('A' <= uc && uc <= 'Z') { 281 return uc - 'A' + 10; 282 } 283 284 MOZ_ASSERT(IsAsciiLowercaseAlpha(aChar), 285 "non-ASCII alphanumeric character can't be converted to number"); 286 return uc - 'a' + 10; 287 } 288 289 } // namespace mozilla 290 291 #endif /* mozilla_TextUtils_h */