tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utf_string_conversions.cc (10827B)


      1 // Copyright 2018 The Chromium Authors
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/strings/utf_string_conversions.h"
      6 
      7 #include <limits.h>
      8 #include <stdint.h>
      9 
     10 #include <ostream>
     11 #include <type_traits>
     12 
     13 #include "base/strings/string_piece.h"
     14 #include "base/strings/string_util.h"
     15 #include "base/strings/utf_ostream_operators.h"
     16 #include "base/strings/utf_string_conversion_utils.h"
     17 #include "base/third_party/icu/icu_utf.h"
     18 #include "build/build_config.h"
     19 
     20 namespace base {
     21 
     22 namespace {
     23 
     24 constexpr base_icu::UChar32 kErrorCodePoint = 0xFFFD;
     25 
     26 // Size coefficient ----------------------------------------------------------
     27 // The maximum number of codeunits in the destination encoding corresponding to
     28 // one codeunit in the source encoding.
     29 
     30 template <typename SrcChar, typename DestChar>
     31 struct SizeCoefficient {
     32  static_assert(sizeof(SrcChar) < sizeof(DestChar),
     33                "Default case: from a smaller encoding to the bigger one");
     34 
     35  // ASCII symbols are encoded by one codeunit in all encodings.
     36  static constexpr int value = 1;
     37 };
     38 
     39 template <>
     40 struct SizeCoefficient<char16_t, char> {
     41  // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
     42  static constexpr int value = 3;
     43 };
     44 
     45 #if defined(WCHAR_T_IS_UTF32)
     46 template <>
     47 struct SizeCoefficient<wchar_t, char> {
     48  // UTF-8 uses at most 4 codeunits per character.
     49  static constexpr int value = 4;
     50 };
     51 
     52 template <>
     53 struct SizeCoefficient<wchar_t, char16_t> {
     54  // UTF-16 uses at most 2 codeunits per character.
     55  static constexpr int value = 2;
     56 };
     57 #endif  // defined(WCHAR_T_IS_UTF32)
     58 
     59 template <typename SrcChar, typename DestChar>
     60 constexpr int size_coefficient_v =
     61    SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
     62 
     63 // UnicodeAppendUnsafe --------------------------------------------------------
     64 // Function overloads that write code_point to the output string. Output string
     65 // has to have enough space for the codepoint.
     66 
     67 // Convenience typedef that checks whether the passed in type is integral (i.e.
     68 // bool, char, int or their extended versions) and is of the correct size.
     69 template <typename Char, size_t N>
     70 using EnableIfBitsAre =
     71    std::enable_if_t<std::is_integral_v<Char> && CHAR_BIT * sizeof(Char) == N,
     72                     bool>;
     73 
     74 template <typename Char, EnableIfBitsAre<Char, 8> = true>
     75 void UnicodeAppendUnsafe(Char* out,
     76                         size_t* size,
     77                         base_icu::UChar32 code_point) {
     78  CBU8_APPEND_UNSAFE(reinterpret_cast<uint8_t*>(out), *size, code_point);
     79 }
     80 
     81 template <typename Char, EnableIfBitsAre<Char, 16> = true>
     82 void UnicodeAppendUnsafe(Char* out,
     83                         size_t* size,
     84                         base_icu::UChar32 code_point) {
     85  CBU16_APPEND_UNSAFE(out, *size, code_point);
     86 }
     87 
     88 template <typename Char, EnableIfBitsAre<Char, 32> = true>
     89 void UnicodeAppendUnsafe(Char* out,
     90                         size_t* size,
     91                         base_icu::UChar32 code_point) {
     92  out[(*size)++] = static_cast<Char>(code_point);
     93 }
     94 
     95 // DoUTFConversion ------------------------------------------------------------
     96 // Main driver of UTFConversion specialized for different Src encodings.
     97 // dest has to have enough room for the converted text.
     98 
     99 template <typename DestChar>
    100 bool DoUTFConversion(const char* src,
    101                     size_t src_len,
    102                     DestChar* dest,
    103                     size_t* dest_len) {
    104  bool success = true;
    105 
    106  for (size_t i = 0; i < src_len;) {
    107    base_icu::UChar32 code_point;
    108    CBU8_NEXT(reinterpret_cast<const uint8_t*>(src), i, src_len, code_point);
    109 
    110    if (!IsValidCodepoint(code_point)) {
    111      success = false;
    112      code_point = kErrorCodePoint;
    113    }
    114 
    115    UnicodeAppendUnsafe(dest, dest_len, code_point);
    116  }
    117 
    118  return success;
    119 }
    120 
    121 template <typename DestChar>
    122 bool DoUTFConversion(const char16_t* src,
    123                     size_t src_len,
    124                     DestChar* dest,
    125                     size_t* dest_len) {
    126  bool success = true;
    127 
    128  auto ConvertSingleChar = [&success](char16_t in) -> base_icu::UChar32 {
    129    if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
    130      success = false;
    131      return kErrorCodePoint;
    132    }
    133    return in;
    134  };
    135 
    136  size_t i = 0;
    137 
    138  // Always have another symbol in order to avoid checking boundaries in the
    139  // middle of the surrogate pair.
    140  while (i + 1 < src_len) {
    141    base_icu::UChar32 code_point;
    142 
    143    if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
    144      code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
    145      if (!IsValidCodepoint(code_point)) {
    146        code_point = kErrorCodePoint;
    147        success = false;
    148      }
    149      i += 2;
    150    } else {
    151      code_point = ConvertSingleChar(src[i]);
    152      ++i;
    153    }
    154 
    155    UnicodeAppendUnsafe(dest, dest_len, code_point);
    156  }
    157 
    158  if (i < src_len) {
    159    UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
    160  }
    161 
    162  return success;
    163 }
    164 
    165 #if defined(WCHAR_T_IS_UTF32)
    166 
    167 template <typename DestChar>
    168 bool DoUTFConversion(const wchar_t* src,
    169                     size_t src_len,
    170                     DestChar* dest,
    171                     size_t* dest_len) {
    172  bool success = true;
    173 
    174  for (size_t i = 0; i < src_len; ++i) {
    175    auto code_point = static_cast<base_icu::UChar32>(src[i]);
    176 
    177    if (!IsValidCodepoint(code_point)) {
    178      success = false;
    179      code_point = kErrorCodePoint;
    180    }
    181 
    182    UnicodeAppendUnsafe(dest, dest_len, code_point);
    183  }
    184 
    185  return success;
    186 }
    187 
    188 #endif  // defined(WCHAR_T_IS_UTF32)
    189 
    190 // UTFConversion --------------------------------------------------------------
    191 // Function template for generating all UTF conversions.
    192 
    193 template <typename InputString, typename DestString>
    194 bool UTFConversion(const InputString& src_str, DestString* dest_str) {
    195  if (IsStringASCII(src_str)) {
    196    dest_str->assign(src_str.begin(), src_str.end());
    197    return true;
    198  }
    199 
    200  dest_str->resize(src_str.length() *
    201                   size_coefficient_v<typename InputString::value_type,
    202                                      typename DestString::value_type>);
    203 
    204  // Empty string is ASCII => it OK to call operator[].
    205  auto* dest = &(*dest_str)[0];
    206 
    207  // ICU requires 32 bit numbers.
    208  size_t src_len = src_str.length();
    209  size_t dest_len = 0;
    210 
    211  bool res = DoUTFConversion(src_str.data(), src_len, dest, &dest_len);
    212 
    213  dest_str->resize(dest_len);
    214  dest_str->shrink_to_fit();
    215 
    216  return res;
    217 }
    218 
    219 }  // namespace
    220 
    221 // UTF16 <-> UTF8 --------------------------------------------------------------
    222 
    223 bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
    224  return UTFConversion(StringPiece(src, src_len), output);
    225 }
    226 
    227 std::u16string UTF8ToUTF16(StringPiece utf8) {
    228  std::u16string ret;
    229  // Ignore the success flag of this call, it will do the best it can for
    230  // invalid input, which is what we want here.
    231  UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
    232  return ret;
    233 }
    234 
    235 bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
    236  return UTFConversion(StringPiece16(src, src_len), output);
    237 }
    238 
    239 std::string UTF16ToUTF8(StringPiece16 utf16) {
    240  std::string ret;
    241  // Ignore the success flag of this call, it will do the best it can for
    242  // invalid input, which is what we want here.
    243  UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
    244  return ret;
    245 }
    246 
    247 // UTF-16 <-> Wide -------------------------------------------------------------
    248 
    249 #if defined(WCHAR_T_IS_UTF16)
    250 // When wide == UTF-16 the conversions are a NOP.
    251 
    252 bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
    253  output->assign(src, src + src_len);
    254  return true;
    255 }
    256 
    257 std::u16string WideToUTF16(WStringPiece wide) {
    258  return std::u16string(wide.begin(), wide.end());
    259 }
    260 
    261 bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
    262  output->assign(src, src + src_len);
    263  return true;
    264 }
    265 
    266 std::wstring UTF16ToWide(StringPiece16 utf16) {
    267  return std::wstring(utf16.begin(), utf16.end());
    268 }
    269 
    270 #elif defined(WCHAR_T_IS_UTF32)
    271 
    272 bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
    273  return UTFConversion(base::WStringPiece(src, src_len), output);
    274 }
    275 
    276 std::u16string WideToUTF16(WStringPiece wide) {
    277  std::u16string ret;
    278  // Ignore the success flag of this call, it will do the best it can for
    279  // invalid input, which is what we want here.
    280  WideToUTF16(wide.data(), wide.length(), &ret);
    281  return ret;
    282 }
    283 
    284 bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
    285  return UTFConversion(StringPiece16(src, src_len), output);
    286 }
    287 
    288 std::wstring UTF16ToWide(StringPiece16 utf16) {
    289  std::wstring ret;
    290  // Ignore the success flag of this call, it will do the best it can for
    291  // invalid input, which is what we want here.
    292  UTF16ToWide(utf16.data(), utf16.length(), &ret);
    293  return ret;
    294 }
    295 
    296 #endif  // defined(WCHAR_T_IS_UTF32)
    297 
    298 // UTF-8 <-> Wide --------------------------------------------------------------
    299 
    300 // UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits
    301 
    302 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
    303  return UTFConversion(StringPiece(src, src_len), output);
    304 }
    305 
    306 std::wstring UTF8ToWide(StringPiece utf8) {
    307  std::wstring ret;
    308  // Ignore the success flag of this call, it will do the best it can for
    309  // invalid input, which is what we want here.
    310  UTF8ToWide(utf8.data(), utf8.length(), &ret);
    311  return ret;
    312 }
    313 
    314 #if defined(WCHAR_T_IS_UTF16)
    315 // Easy case since we can use the "utf" versions we already wrote above.
    316 
    317 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
    318  return UTF16ToUTF8(as_u16cstr(src), src_len, output);
    319 }
    320 
    321 std::string WideToUTF8(WStringPiece wide) {
    322  return UTF16ToUTF8(StringPiece16(as_u16cstr(wide), wide.size()));
    323 }
    324 
    325 #elif defined(WCHAR_T_IS_UTF32)
    326 
    327 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
    328  return UTFConversion(WStringPiece(src, src_len), output);
    329 }
    330 
    331 std::string WideToUTF8(WStringPiece wide) {
    332  std::string ret;
    333  // Ignore the success flag of this call, it will do the best it can for
    334  // invalid input, which is what we want here.
    335  WideToUTF8(wide.data(), wide.length(), &ret);
    336  return ret;
    337 }
    338 
    339 #endif  // defined(WCHAR_T_IS_UTF32)
    340 
    341 std::u16string ASCIIToUTF16(StringPiece ascii) {
    342  DCHECK(IsStringASCII(ascii)) << ascii;
    343  return std::u16string(ascii.begin(), ascii.end());
    344 }
    345 
    346 std::string UTF16ToASCII(StringPiece16 utf16) {
    347  DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
    348  return std::string(utf16.begin(), utf16.end());
    349 }
    350 
    351 #if defined(WCHAR_T_IS_UTF16)
    352 std::wstring ASCIIToWide(StringPiece ascii) {
    353  DCHECK(IsStringASCII(ascii)) << ascii;
    354  return std::wstring(ascii.begin(), ascii.end());
    355 }
    356 
    357 std::string WideToASCII(WStringPiece wide) {
    358  DCHECK(IsStringASCII(wide)) << wide;
    359  return std::string(wide.begin(), wide.end());
    360 }
    361 #endif  // defined(WCHAR_T_IS_UTF16)
    362 
    363 }  // namespace base