tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utf_string_conversion_utils.h (5238B)


      1 // Copyright 2011 The Chromium Authors
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
      6 #define BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_
      7 
      8 // Low-level UTF handling functions. Most code will want to use the functions
      9 // in utf_string_conversions.h instead.
     10 
     11 #include <stddef.h>
     12 #include <stdint.h>
     13 
     14 #include <limits>
     15 #include <string>
     16 #include <string_view>
     17 
     18 #include "base/base_export.h"
     19 #include "base/third_party/icu/icu_utf.h"
     20 #include "build/build_config.h"
     21 #include "third_party/abseil-cpp/absl/types/optional.h"
     22 
     23 namespace base {
     24 
     25 inline bool IsValidCodepoint(base_icu::UChar32 code_point) {
     26  // A code point is valid when it is a Unicode scalar value: it lies in
     27  // [0, 0x10FFFF] (the highest code point allowed) and is not a surrogate
     28  // ([0xD800, 0xDFFF]). Non-characters and unassigned code points pass.
     29  // https://unicode.org/glossary/#unicode_scalar_value
     30  const bool in_range = code_point >= 0 && code_point <= 0x10FFFF;
     31  const bool is_surrogate = code_point >= 0xD800 && code_point <= 0xDFFF;
     32  return in_range && !is_surrogate;
     33 }
     34 
     35 inline bool IsValidCharacter(base_icu::UChar32 code_point) {
     36  // Valid characters are the Unicode scalar values minus the non-characters
     37  // (U+FDD0..U+FDEF, and every code point ending in 0xFFFE or 0xFFFF).
     38  // https://unicode.org/faq/private_use.html#nonchar1
     39  const bool is_scalar = code_point >= 0 && code_point <= 0x10FFFF &&
     40                         !(code_point >= 0xD800 && code_point <= 0xDFFF);
     41  const bool in_fdd0_block = code_point >= 0xFDD0 && code_point <= 0xFDEF;
     42  return is_scalar && !in_fdd0_block && (code_point & 0xFFFE) != 0xFFFE;
     43 }
     44 
     45 // CountUnicodeCharacters ------------------------------------------------------
     46 
     47 // Returns the number of Unicode characters in `text`, up to the supplied
     48 // `limit` (by default the maximum `size_t`), or `nullopt` if `text` is not valid UTF-8.
     49 BASE_EXPORT absl::optional<size_t> CountUnicodeCharacters(
     50    std::string_view text,
     51    size_t limit = std::numeric_limits<size_t>::max());
     52 
     53 // ReadUnicodeCharacter --------------------------------------------------------
     54 
     55 // Reads a UTF-8 stream, placing the next code point into the given output
     56 // |*code_point_out|. |src| represents the entire string to read (|src_len|
     57 // is presumably its length; TODO confirm), and |*char_index| is the offset
     58 // within the string to start reading at. |*char_index| will be updated to
     59 // index the last character read, such that incrementing it (as in a for
     60 // loop) will take the reader to the next character.
     61 // Returns true on success. On failure, |*code_point_out| will be invalid.
     62 BASE_EXPORT bool ReadUnicodeCharacter(const char* src,
     63                                      size_t src_len,
     64                                      size_t* char_index,
     65                                      base_icu::UChar32* code_point_out);
     66 
     67 // Reads a UTF-16 character into |*code_point|. Usage is as for the 8-bit version.
     68 BASE_EXPORT bool ReadUnicodeCharacter(const char16_t* src,
     69                                      size_t src_len,
     70                                      size_t* char_index,
     71                                      base_icu::UChar32* code_point);
     72 
     73 #if defined(WCHAR_T_IS_UTF32)
     74 // Reads a UTF-32 character into |*code_point|. Usage is as for the 8-bit version.
     75 BASE_EXPORT bool ReadUnicodeCharacter(const wchar_t* src,
     76                                      size_t src_len,
     77                                      size_t* char_index,
     78                                      base_icu::UChar32* code_point);
     79 #endif  // defined(WCHAR_T_IS_UTF32)
     80 
     81 // WriteUnicodeCharacter -------------------------------------------------------
     82 
     83 // Appends |code_point| as a UTF-8 character to the 8-bit string |*output|.
     84 // Returns the number of bytes written.
     85 BASE_EXPORT size_t WriteUnicodeCharacter(base_icu::UChar32 code_point,
     86                                         std::string* output);
     87 
     88 // Appends |code_point| as a UTF-16 character to the 16-bit string |*output|.
     89 // Returns the number of 16-bit values written.
     90 BASE_EXPORT size_t WriteUnicodeCharacter(base_icu::UChar32 code_point,
     91                                         std::u16string* output);
     92 
     93 #if defined(WCHAR_T_IS_UTF32)
     94 // Appends |code_point| as a UTF-32 character to the 32-bit string |*output|.
     95 // Always returns 1, the number of 32-bit values written.
     96 inline size_t WriteUnicodeCharacter(base_icu::UChar32 code_point,
     97                                    std::wstring* output) {
     98  // With a UTF-32 wchar_t the code point is a single element: append it as-is.
     99  *output += static_cast<wchar_t>(code_point);
    100  return 1;
    101 }
    102 #endif  // defined(WCHAR_T_IS_UTF32)
    103 
    104 // Generalized Unicode converter -----------------------------------------------
    105 
    106 // Guesses the length of the UTF-8 output in bytes, clears |*output|, and
    107 // reserves that amount of space in it. The input character type CHAR is
    108 // assumed to be unsigned, which will be true for UTF-16 and -32 on our
    109 // systems.
    110 template<typename CHAR>
    111 void PrepareForUTF8Output(const CHAR* src, size_t src_len, std::string* output);
    112 
    113 // Prepares the output buffer |*output| (containing either UTF-16 or -32 data)
    114 // given the UTF-8 input |src| that will be converted to it. See PrepareForUTF8Output().
    115 template<typename STRING>
    116 void PrepareForUTF16Or32Output(const char* src, size_t src_len, STRING* output);
    117 
    118 }  // namespace base
    119 
    120 #endif  // BASE_STRINGS_UTF_STRING_CONVERSION_UTILS_H_