tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Latin1.h (9028B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 /* Latin-1 operations (i.e. a byte is the corresponding code point).
      6 * (Note: this is *not* the same as the encoding of windows-1252 or
      7 * latin1 content on the web. In Web terms, this encoding
      8 * corresponds to "isomorphic decode" / "isomorphic encoding" from
      9 * the Infra Standard.)
     10 */
     11 
     12 #ifndef mozilla_Latin1_h
     13 #define mozilla_Latin1_h
     14 
#include <cstddef>
#include <cstdint>
#include <tuple>
#include <type_traits>

#include "mozilla/JsRust.h"
#include "mozilla/Span.h"

#if MOZ_HAS_JSRUST()
#  include "encoding_rs_mem.h"
#endif
     23 
     24 namespace mozilla {
     25 
     26 namespace detail {
     27 
// Length threshold (in code units) below which the conversion and check
// functions in this header use their inline loops instead of calling into
// Rust.
//
// It's important for optimizations that the Latin1ness checks and the
// inflation/deflation functions use the same short string limit. The
// limit is 16, because that's the shortest length that inflates/deflates
// using SIMD.
//
// In this file the limit is shared by IsUtf16Latin1,
// LossyConvertUtf16toLatin1 and ConvertLatin1toUtf16.
constexpr size_t kShortStringLimitForInlinePaths = 16;
     33 
     34 template <typename Char>
     35 class MakeUnsignedChar {
     36 public:
     37  using Type = std::make_unsigned_t<Char>;
     38 };
     39 
     40 template <>
     41 class MakeUnsignedChar<char16_t> {
     42 public:
     43  using Type = char16_t;
     44 };
     45 
     46 template <>
     47 class MakeUnsignedChar<char32_t> {
     48 public:
     49  using Type = char32_t;
     50 };
     51 
     52 }  // namespace detail
     53 
     54 /**
     55 * Returns true iff |aChar| is Latin-1 but not ASCII, i.e. in the range
     56 * [0x80, 0xFF].
     57 */
     58 template <typename Char>
     59 constexpr bool IsNonAsciiLatin1(Char aChar) {
     60  using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
     61  auto uc = static_cast<UnsignedChar>(aChar);
     62  return uc >= 0x80 && uc <= 0xFF;
     63 }
     64 
     65 #if MOZ_HAS_JSRUST()
     66 
     67 /**
     68 * Returns |true| iff |aString| contains only Latin1 characters, that is,
     69 * characters in the range [U+0000, U+00FF].
     70 *
     71 * @param aString a potentially-invalid UTF-16 string to scan
     72 */
     73 inline bool IsUtf16Latin1(mozilla::Span<const char16_t> aString) {
     74  size_t length = aString.Length();
     75  const char16_t* ptr = aString.Elements();
     76  // For short strings, calling into Rust is a pessimization, and the SIMD
     77  // code won't have a chance to kick in anyway.
     78  // 16 is a bit larger than logically necessary for this function alone,
     79  // but it's important that the limit here matches the limit used in
     80  // LossyConvertUtf16toLatin1!
     81  if (length < mozilla::detail::kShortStringLimitForInlinePaths) {
     82    char16_t accu = 0;
     83    for (size_t i = 0; i < length; i++) {
     84      accu |= ptr[i];
     85    }
     86    return accu < 0x100;
     87  }
     88  return encoding_mem_is_utf16_latin1(ptr, length);
     89 }
     90 
     91 /**
     92 * Returns |true| iff |aString| is valid UTF-8 containing only Latin-1
     93 * characters.
     94 *
     95 * If you know that the argument is always absolutely guaranteed to be valid
     96 * UTF-8, use the faster UnsafeIsValidUtf8Latin1() instead.
     97 *
     98 * @param aString potentially-invalid UTF-8 string to scan
     99 */
    100 inline bool IsUtf8Latin1(mozilla::Span<const char> aString) {
    101  return encoding_mem_is_utf8_latin1(aString.Elements(), aString.Length());
    102 }
    103 
    104 /**
    105 * Returns |true| iff |aString|, which MUST be valid UTF-8, contains only
    106 * Latin1 characters, that is, characters in the range [U+0000, U+00FF].
    107 * (If |aString| might not be valid UTF-8, use |IsUtf8Latin1| instead.)
    108 *
    109 * @param aString known-valid UTF-8 string to scan
    110 */
    111 inline bool UnsafeIsValidUtf8Latin1(mozilla::Span<const char> aString) {
    112  return encoding_mem_is_str_latin1(aString.Elements(), aString.Length());
    113 }
    114 
    115 /**
    116 * Returns the index of first byte that starts an invalid byte
    117 * sequence or a non-Latin1 byte sequence in a potentially-invalid UTF-8
    118 * string, or the length of the string if there are neither.
    119 *
    120 * If you know that the argument is always absolutely guaranteed to be valid
    121 * UTF-8, use the faster UnsafeValidUtf8Lati1UpTo() instead.
    122 *
    123 * @param aString potentially-invalid UTF-8 string to scan
    124 */
    125 inline size_t Utf8Latin1UpTo(mozilla::Span<const char> aString) {
    126  return encoding_mem_utf8_latin1_up_to(aString.Elements(), aString.Length());
    127 }
    128 
    129 /**
    130 * Returns the index of first byte that starts a non-Latin1 byte
    131 * sequence in a known-valid UTF-8 string, or the length of the
    132 * string if there are none. (If the string might not be valid
    133 * UTF-8, use Utf8Latin1UpTo() instead.)
    134 *
    135 * @param aString known-valid UTF-8 string to scan
    136 */
    137 inline size_t UnsafeValidUtf8Lati1UpTo(mozilla::Span<const char> aString) {
    138  return encoding_mem_str_latin1_up_to(aString.Elements(), aString.Length());
    139 }
    140 
    141 /**
    142 * If all the code points in the input are below U+0100, converts to Latin1,
    143 * i.e. unsigned byte value is Unicode scalar value. If there are code points
    144 * above U+00FF, produces unspecified garbage in a memory-safe way. The
    145 * nature of the garbage must not be relied upon.
    146 *
    147 * The length of aDest must not be less than the length of aSource.
    148 */
    149 inline void LossyConvertUtf16toLatin1(mozilla::Span<const char16_t> aSource,
    150                                      mozilla::Span<char> aDest) {
    151  const char16_t* srcPtr = aSource.Elements();
    152  size_t srcLen = aSource.Length();
    153  char* dstPtr = aDest.Elements();
    154  size_t dstLen = aDest.Length();
    155  // Avoid function call overhead when SIMD isn't used anyway
    156  // If you change the length limit here, be sure to change
    157  // IsUtf16Latin1 and IsAscii to match so that optimizations don't
    158  // fail!
    159  if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) {
    160    MOZ_ASSERT(dstLen >= srcLen);
    161    uint8_t* unsignedPtr = reinterpret_cast<uint8_t*>(dstPtr);
    162    const char16_t* end = srcPtr + srcLen;
    163    while (srcPtr < end) {
    164      *unsignedPtr = static_cast<uint8_t>(*srcPtr);
    165      ++srcPtr;
    166      ++unsignedPtr;
    167    }
    168    return;
    169  }
    170  encoding_mem_convert_utf16_to_latin1_lossy(srcPtr, srcLen, dstPtr, dstLen);
    171 }
    172 
    173 /**
    174 * If all the code points in the input are below U+0100, converts to Latin1,
    175 * i.e. unsigned byte value is Unicode scalar value. If there are code points
    176 * above U+00FF, produces unspecified garbage in a memory-safe way. The
    177 * nature of the garbage must not be relied upon.
    178 *
    179 * Returns the number of code units written.
    180 *
    181 * The length of aDest must not be less than the length of aSource.
    182 */
    183 inline size_t LossyConvertUtf8toLatin1(mozilla::Span<const char> aSource,
    184                                       mozilla::Span<char> aDest) {
    185  return encoding_mem_convert_utf8_to_latin1_lossy(
    186      aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
    187 }
    188 
    189 /**
    190 * Converts each byte of |aSource|, interpreted as a Unicode scalar value
    191 * having that unsigned value, to its UTF-8 representation in |aDest|.
    192 *
    193 * Returns the number of code units written.
    194 *
    195 * The length of aDest must be at least twice the length of aSource.
    196 */
    197 inline size_t ConvertLatin1toUtf8(mozilla::Span<const char> aSource,
    198                                  mozilla::Span<char> aDest) {
    199  return encoding_mem_convert_latin1_to_utf8(
    200      aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
    201 }
    202 
    203 /**
    204 * Converts bytes whose unsigned value is interpreted as Unicode code point
    205 * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
    206 * output space.
    207 *
    208 * Returns the number of bytes read and the number of bytes written.
    209 *
    210 * If the output isn't large enough, not all input is consumed.
    211 *
    212 * The conversion is guaranteed to be complete if the length of aDest is
    213 * at least the length of aSource times two.
    214 *
    215 * The output is always valid UTF-8 ending on scalar value boundary
    216 * even in the case of partial conversion.
    217 *
    218 * The semantics of this function match the semantics of
    219 * TextEncoder.encodeInto.
    220 * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
    221 */
    222 inline std::tuple<size_t, size_t> ConvertLatin1toUtf8Partial(
    223    mozilla::Span<const char> aSource, mozilla::Span<char> aDest) {
    224  size_t srcLen = aSource.Length();
    225  size_t dstLen = aDest.Length();
    226  encoding_mem_convert_latin1_to_utf8_partial(aSource.Elements(), &srcLen,
    227                                              aDest.Elements(), &dstLen);
    228  return std::make_tuple(srcLen, dstLen);
    229 }
    230 
    231 /**
    232 * Converts Latin-1 code points (i.e. each byte is the identical code
    233 * point) from |aSource| to UTF-16 code points in |aDest|.
    234 *
    235 * The length of aDest must not be less than the length of aSource.
    236 */
    237 inline void ConvertLatin1toUtf16(mozilla::Span<const char> aSource,
    238                                 mozilla::Span<char16_t> aDest) {
    239  const char* srcPtr = aSource.Elements();
    240  size_t srcLen = aSource.Length();
    241  char16_t* dstPtr = aDest.Elements();
    242  size_t dstLen = aDest.Length();
    243  // Avoid function call overhead when SIMD isn't used anyway
    244  if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) {
    245    MOZ_ASSERT(dstLen >= srcLen);
    246    const uint8_t* unsignedPtr = reinterpret_cast<const uint8_t*>(srcPtr);
    247    const uint8_t* end = unsignedPtr + srcLen;
    248    while (unsignedPtr < end) {
    249      *dstPtr = *unsignedPtr;
    250      ++unsignedPtr;
    251      ++dstPtr;
    252    }
    253    return;
    254  }
    255  encoding_mem_convert_latin1_to_utf16(srcPtr, srcLen, dstPtr, dstLen);
    256 }
    257 
    258 #endif
    259 
    260 };  // namespace mozilla
    261 
    262 #endif  // mozilla_Latin1_h