Latin1.h (9028B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 /* Latin-1 operations (i.e. a byte is the corresponding code point). 6 * (Note: this is *not* the same as the encoding of windows-1252 or 7 * latin1 content on the web. In Web terms, this encoding 8 * corresponds to "isomorphic decode" / "isomorphic encoding" from 9 * the Infra Standard.) 10 */ 11 12 #ifndef mozilla_Latin1_h 13 #define mozilla_Latin1_h 14 15 #include <type_traits> 16 17 #include "mozilla/JsRust.h" 18 #include "mozilla/Span.h" 19 20 #if MOZ_HAS_JSRUST() 21 # include "encoding_rs_mem.h" 22 #endif 23 24 namespace mozilla { 25 26 namespace detail { 27 28 // It's important for optimizations that Latin1ness checks 29 // and inflation/deflation function use the same short 30 // string limit. The limit is 16, because that's the shortest 31 // that inflates/deflates using SIMD. 32 constexpr size_t kShortStringLimitForInlinePaths = 16; 33 34 template <typename Char> 35 class MakeUnsignedChar { 36 public: 37 using Type = std::make_unsigned_t<Char>; 38 }; 39 40 template <> 41 class MakeUnsignedChar<char16_t> { 42 public: 43 using Type = char16_t; 44 }; 45 46 template <> 47 class MakeUnsignedChar<char32_t> { 48 public: 49 using Type = char32_t; 50 }; 51 52 } // namespace detail 53 54 /** 55 * Returns true iff |aChar| is Latin-1 but not ASCII, i.e. in the range 56 * [0x80, 0xFF]. 57 */ 58 template <typename Char> 59 constexpr bool IsNonAsciiLatin1(Char aChar) { 60 using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type; 61 auto uc = static_cast<UnsignedChar>(aChar); 62 return uc >= 0x80 && uc <= 0xFF; 63 } 64 65 #if MOZ_HAS_JSRUST() 66 67 /** 68 * Returns |true| iff |aString| contains only Latin1 characters, that is, 69 * characters in the range [U+0000, U+00FF]. 70 * 71 * @param aString a potentially-invalid UTF-16 string to scan 72 */ 73 inline bool IsUtf16Latin1(mozilla::Span<const char16_t> aString) { 74 size_t length = aString.Length(); 75 const char16_t* ptr = aString.Elements(); 76 // For short strings, calling into Rust is a pessimization, and the SIMD 77 // code won't have a chance to kick in anyway. 78 // 16 is a bit larger than logically necessary for this function alone, 79 // but it's important that the limit here matches the limit used in 80 // LossyConvertUtf16toLatin1! 81 if (length < mozilla::detail::kShortStringLimitForInlinePaths) { 82 char16_t accu = 0; 83 for (size_t i = 0; i < length; i++) { 84 accu |= ptr[i]; 85 } 86 return accu < 0x100; 87 } 88 return encoding_mem_is_utf16_latin1(ptr, length); 89 } 90 91 /** 92 * Returns |true| iff |aString| is valid UTF-8 containing only Latin-1 93 * characters. 94 * 95 * If you know that the argument is always absolutely guaranteed to be valid 96 * UTF-8, use the faster UnsafeIsValidUtf8Latin1() instead. 97 * 98 * @param aString potentially-invalid UTF-8 string to scan 99 */ 100 inline bool IsUtf8Latin1(mozilla::Span<const char> aString) { 101 return encoding_mem_is_utf8_latin1(aString.Elements(), aString.Length()); 102 } 103 104 /** 105 * Returns |true| iff |aString|, which MUST be valid UTF-8, contains only 106 * Latin1 characters, that is, characters in the range [U+0000, U+00FF]. 107 * (If |aString| might not be valid UTF-8, use |IsUtf8Latin1| instead.) 108 * 109 * @param aString known-valid UTF-8 string to scan 110 */ 111 inline bool UnsafeIsValidUtf8Latin1(mozilla::Span<const char> aString) { 112 return encoding_mem_is_str_latin1(aString.Elements(), aString.Length()); 113 } 114 115 /** 116 * Returns the index of first byte that starts an invalid byte 117 * sequence or a non-Latin1 byte sequence in a potentially-invalid UTF-8 118 * string, or the length of the string if there are neither. 119 * 120 * If you know that the argument is always absolutely guaranteed to be valid 121 * UTF-8, use the faster UnsafeValidUtf8Lati1UpTo() instead. 122 * 123 * @param aString potentially-invalid UTF-8 string to scan 124 */ 125 inline size_t Utf8Latin1UpTo(mozilla::Span<const char> aString) { 126 return encoding_mem_utf8_latin1_up_to(aString.Elements(), aString.Length()); 127 } 128 129 /** 130 * Returns the index of first byte that starts a non-Latin1 byte 131 * sequence in a known-valid UTF-8 string, or the length of the 132 * string if there are none. (If the string might not be valid 133 * UTF-8, use Utf8Latin1UpTo() instead.) 134 * 135 * @param aString known-valid UTF-8 string to scan 136 */ 137 inline size_t UnsafeValidUtf8Lati1UpTo(mozilla::Span<const char> aString) { 138 return encoding_mem_str_latin1_up_to(aString.Elements(), aString.Length()); 139 } 140 141 /** 142 * If all the code points in the input are below U+0100, converts to Latin1, 143 * i.e. unsigned byte value is Unicode scalar value. If there are code points 144 * above U+00FF, produces unspecified garbage in a memory-safe way. The 145 * nature of the garbage must not be relied upon. 146 * 147 * The length of aDest must not be less than the length of aSource. 148 */ 149 inline void LossyConvertUtf16toLatin1(mozilla::Span<const char16_t> aSource, 150 mozilla::Span<char> aDest) { 151 const char16_t* srcPtr = aSource.Elements(); 152 size_t srcLen = aSource.Length(); 153 char* dstPtr = aDest.Elements(); 154 size_t dstLen = aDest.Length(); 155 // Avoid function call overhead when SIMD isn't used anyway 156 // If you change the length limit here, be sure to change 157 // IsUtf16Latin1 and IsAscii to match so that optimizations don't 158 // fail! 159 if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) { 160 MOZ_ASSERT(dstLen >= srcLen); 161 uint8_t* unsignedPtr = reinterpret_cast<uint8_t*>(dstPtr); 162 const char16_t* end = srcPtr + srcLen; 163 while (srcPtr < end) { 164 *unsignedPtr = static_cast<uint8_t>(*srcPtr); 165 ++srcPtr; 166 ++unsignedPtr; 167 } 168 return; 169 } 170 encoding_mem_convert_utf16_to_latin1_lossy(srcPtr, srcLen, dstPtr, dstLen); 171 } 172 173 /** 174 * If all the code points in the input are below U+0100, converts to Latin1, 175 * i.e. unsigned byte value is Unicode scalar value. If there are code points 176 * above U+00FF, produces unspecified garbage in a memory-safe way. The 177 * nature of the garbage must not be relied upon. 178 * 179 * Returns the number of code units written. 180 * 181 * The length of aDest must not be less than the length of aSource. 182 */ 183 inline size_t LossyConvertUtf8toLatin1(mozilla::Span<const char> aSource, 184 mozilla::Span<char> aDest) { 185 return encoding_mem_convert_utf8_to_latin1_lossy( 186 aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); 187 } 188 189 /** 190 * Converts each byte of |aSource|, interpreted as a Unicode scalar value 191 * having that unsigned value, to its UTF-8 representation in |aDest|. 192 * 193 * Returns the number of code units written. 194 * 195 * The length of aDest must be at least twice the length of aSource. 196 */ 197 inline size_t ConvertLatin1toUtf8(mozilla::Span<const char> aSource, 198 mozilla::Span<char> aDest) { 199 return encoding_mem_convert_latin1_to_utf8( 200 aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); 201 } 202 203 /** 204 * Converts bytes whose unsigned value is interpreted as Unicode code point 205 * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient 206 * output space. 207 * 208 * Returns the number of bytes read and the number of bytes written. 209 * 210 * If the output isn't large enough, not all input is consumed. 211 * 212 * The conversion is guaranteed to be complete if the length of aDest is 213 * at least the length of aSource times two. 214 * 215 * The output is always valid UTF-8 ending on scalar value boundary 216 * even in the case of partial conversion. 217 * 218 * The semantics of this function match the semantics of 219 * TextEncoder.encodeInto. 220 * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto 221 */ 222 inline std::tuple<size_t, size_t> ConvertLatin1toUtf8Partial( 223 mozilla::Span<const char> aSource, mozilla::Span<char> aDest) { 224 size_t srcLen = aSource.Length(); 225 size_t dstLen = aDest.Length(); 226 encoding_mem_convert_latin1_to_utf8_partial(aSource.Elements(), &srcLen, 227 aDest.Elements(), &dstLen); 228 return std::make_tuple(srcLen, dstLen); 229 } 230 231 /** 232 * Converts Latin-1 code points (i.e. each byte is the identical code 233 * point) from |aSource| to UTF-16 code points in |aDest|. 234 * 235 * The length of aDest must not be less than the length of aSource. 236 */ 237 inline void ConvertLatin1toUtf16(mozilla::Span<const char> aSource, 238 mozilla::Span<char16_t> aDest) { 239 const char* srcPtr = aSource.Elements(); 240 size_t srcLen = aSource.Length(); 241 char16_t* dstPtr = aDest.Elements(); 242 size_t dstLen = aDest.Length(); 243 // Avoid function call overhead when SIMD isn't used anyway 244 if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) { 245 MOZ_ASSERT(dstLen >= srcLen); 246 const uint8_t* unsignedPtr = reinterpret_cast<const uint8_t*>(srcPtr); 247 const uint8_t* end = unsignedPtr + srcLen; 248 while (unsignedPtr < end) { 249 *dstPtr = *unsignedPtr; 250 ++unsignedPtr; 251 ++dstPtr; 252 } 253 return; 254 } 255 encoding_mem_convert_latin1_to_utf16(srcPtr, srcLen, dstPtr, dstLen); 256 } 257 258 #endif 259 260 }; // namespace mozilla 261 262 #endif // mozilla_Latin1_h