utfstring.h (5011B)
1 // © 2025 and later: Unicode, Inc. and others. 2 // License & terms of use: https://www.unicode.org/copyright.html 3 4 // utfstring.h 5 // created: 2025jul18 Markus W. Scherer 6 7 #ifndef __UTFSTRING_H__ 8 #define __UTFSTRING_H__ 9 10 #include "unicode/utypes.h" 11 12 #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H) 13 14 #include "unicode/utf16.h" 15 16 /** 17 * \file 18 * \brief C++ header-only API: C++ string helper functions. 19 */ 20 21 #ifndef U_HIDE_DRAFT_API 22 23 namespace U_HEADER_ONLY_NAMESPACE { 24 namespace utfstring { 25 26 // Write code points to strings -------------------------------------------- *** 27 28 #ifndef U_IN_DOXYGEN 29 namespace prv { 30 31 // This function, and the public wrappers, 32 // want to be U_FORCE_INLINE but the gcc-debug-build-and-test CI check failed with 33 // error: ‘always_inline’ function might not be inlinable [-Werror=attributes] 34 template<typename StringClass, bool validate> 35 inline StringClass &appendCodePoint(StringClass &s, uint32_t c) { 36 using Unit = typename StringClass::value_type; 37 if constexpr (sizeof(Unit) == 1) { 38 // UTF-8: Similar to U8_APPEND(). 39 if (c <= 0x7f) { 40 s.push_back(static_cast<Unit>(c)); 41 } else { 42 Unit buf[4]; 43 uint8_t len; 44 if (c <= 0x7ff) { 45 len = 2; 46 buf[2] = (c >> 6) | 0xc0; 47 } else { 48 if (validate ? 49 c < 0xd800 || 50 (c < 0xe000 || c > 0x10ffff ? (c = 0xfffd, true) : c <= 0xffff) : 51 c <= 0xffff) { 52 len = 3; 53 buf[1] = (c >> 12) | 0xe0; 54 } else { 55 len = 4; 56 buf[0] = (c >> 18) | 0xf0; 57 buf[1] = ((c >> 12) & 0x3f) | 0x80; 58 } 59 buf[2] = ((c >> 6) & 0x3f) | 0x80; 60 } 61 buf[3] = (c & 0x3f) | 0x80; 62 s.append(buf + 4 - len, len); 63 } 64 } else if constexpr (sizeof(Unit) == 2) { 65 // UTF-16: Similar to U16_APPEND(). 66 if (validate ? 67 c < 0xd800 || (c < 0xe000 || c > 0x10ffff ? (c = 0xfffd, true) : c <= 0xffff) : 68 c <= 0xffff) { 69 s.push_back(static_cast<Unit>(c)); 70 } else { 71 Unit buf[2] = { U16_LEAD(c), U16_TRAIL(c) }; 72 s.append(buf, 2); 73 } 74 } else { 75 // UTF-32 76 s.push_back(!validate || U_IS_SCALAR_VALUE(c) ? c : 0xfffd); 77 } 78 return s; 79 } 80 81 } // namespace prv 82 #endif // U_IN_DOXYGEN 83 84 #ifndef U_HIDE_DRAFT_API 85 /** 86 * Appends the code point to the string. 87 * Appends the U+FFFD replacement character instead if c is not a scalar value. 88 * See https://www.unicode.org/glossary/#unicode_scalar_value 89 * 90 * @tparam StringClass A version of std::basic_string (or a compatible type) 91 * @param s The string to append to 92 * @param c The code point to append 93 * @return s 94 * @draft ICU 78 95 * @see U_IS_SCALAR_VALUE 96 */ 97 template<typename StringClass> 98 inline StringClass &appendOrFFFD(StringClass &s, UChar32 c) { 99 return prv::appendCodePoint<StringClass, true>(s, c); 100 } 101 102 /** 103 * Appends the code point to the string. 104 * The code point must be a scalar value; otherwise the behavior is undefined. 105 * See https://www.unicode.org/glossary/#unicode_scalar_value 106 * 107 * @tparam StringClass A version of std::basic_string (or a compatible type) 108 * @param s The string to append to 109 * @param c The code point to append (must be a scalar value) 110 * @return s 111 * @draft ICU 78 112 * @see U_IS_SCALAR_VALUE 113 */ 114 template<typename StringClass> 115 inline StringClass &appendUnsafe(StringClass &s, UChar32 c) { 116 return prv::appendCodePoint<StringClass, false>(s, c); 117 } 118 119 /** 120 * Returns the code point as a string of code units. 121 * Returns the U+FFFD replacement character instead if c is not a scalar value. 122 * See https://www.unicode.org/glossary/#unicode_scalar_value 123 * 124 * @tparam StringClass A version of std::basic_string (or a compatible type) 125 * @param c The code point 126 * @return the string of c's code units 127 * @draft ICU 78 128 * @see U_IS_SCALAR_VALUE 129 */ 130 template<typename StringClass> 131 inline StringClass encodeOrFFFD(UChar32 c) { 132 StringClass s; 133 prv::appendCodePoint<StringClass, true>(s, c); 134 return s; 135 } 136 137 /** 138 * Returns the code point as a string of code units. 139 * The code point must be a scalar value; otherwise the behavior is undefined. 140 * See https://www.unicode.org/glossary/#unicode_scalar_value 141 * 142 * @tparam StringClass A version of std::basic_string (or a compatible type) 143 * @param c The code point 144 * @return the string of c's code units 145 * @draft ICU 78 146 * @see U_IS_SCALAR_VALUE 147 */ 148 template<typename StringClass> 149 inline StringClass encodeUnsafe(UChar32 c) { 150 StringClass s; 151 prv::appendCodePoint<StringClass, false>(s, c); 152 return s; 153 } 154 #endif // U_HIDE_DRAFT_API 155 156 } // namespace utfstring 157 } // namespace U_HEADER_ONLY_NAMESPACE 158 159 #endif // U_HIDE_DRAFT_API 160 #endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API 161 #endif // __UTFSTRING_H__