String.h (8320B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 #ifndef intl_components_String_h_ 6 #define intl_components_String_h_ 7 8 #include "mozilla/Assertions.h" 9 #include "mozilla/Casting.h" 10 #include "mozilla/intl/ICU4CGlue.h" 11 #include "mozilla/intl/ICUError.h" 12 #include "mozilla/PodOperations.h" 13 #include "mozilla/Span.h" 14 #include "mozilla/Try.h" 15 16 #include "unicode/uchar.h" 17 #include "unicode/unorm2.h" 18 #include "unicode/ustring.h" 19 #include "unicode/utext.h" 20 #include "unicode/utypes.h" 21 22 namespace mozilla::intl { 23 24 /** 25 * This component is a Mozilla-focused API for working with strings in 26 * internationalization code. 27 */ 28 class String final { 29 public: 30 String() = delete; 31 32 /** 33 * Return the locale-sensitive lower case string of the input. 34 */ 35 template <typename B> 36 static Result<Ok, ICUError> ToLocaleLowerCase(const char* aLocale, 37 Span<const char16_t> aString, 38 B& aBuffer) { 39 if (!aBuffer.reserve(aString.size())) { 40 return Err(ICUError::OutOfMemory); 41 } 42 return FillBufferWithICUCall( 43 aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { 44 return u_strToLower(target, length, aString.data(), aString.size(), 45 aLocale, status); 46 }); 47 } 48 49 /** 50 * Return the locale-sensitive upper case string of the input. 51 */ 52 template <typename B> 53 static Result<Ok, ICUError> ToLocaleUpperCase(const char* aLocale, 54 Span<const char16_t> aString, 55 B& aBuffer) { 56 if (!aBuffer.reserve(aString.size())) { 57 return Err(ICUError::OutOfMemory); 58 } 59 return FillBufferWithICUCall( 60 aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { 61 return u_strToUpper(target, length, aString.data(), aString.size(), 62 aLocale, status); 63 }); 64 } 65 66 /** 67 * Normalization form constants to describe which normalization algorithm 68 * should be performed. 69 * 70 * Also see: 71 * - Unicode Standard, §2.12 Equivalent Sequences 72 * - Unicode Standard, §3.11 Normalization Forms 73 * - https://unicode.org/reports/tr15/ 74 */ 75 enum class NormalizationForm { 76 /** 77 * Normalization Form C 78 */ 79 NFC, 80 81 /** 82 * Normalization Form D 83 */ 84 NFD, 85 86 /** 87 * Normalization Form KC 88 */ 89 NFKC, 90 91 /** 92 * Normalization Form KD 93 */ 94 NFKD, 95 }; 96 97 enum class AlreadyNormalized : bool { No, Yes }; 98 99 /** 100 * Normalize the input string according to requested normalization form. 101 * 102 * Returns `AlreadyNormalized::Yes` when the string is already in normalized 103 * form. The output buffer is unchanged in this case. Otherwise returns 104 * `AlreadyNormalized::No` and places the normalized string into the output 105 * buffer. 106 */ 107 template <typename B> 108 static Result<AlreadyNormalized, ICUError> Normalize( 109 NormalizationForm aForm, Span<const char16_t> aString, B& aBuffer) { 110 // The unorm2_getXXXInstance() methods return a shared instance which must 111 // not be deleted. 112 UErrorCode status = U_ZERO_ERROR; 113 const UNormalizer2* normalizer; 114 switch (aForm) { 115 case NormalizationForm::NFC: 116 normalizer = unorm2_getNFCInstance(&status); 117 break; 118 case NormalizationForm::NFD: 119 normalizer = unorm2_getNFDInstance(&status); 120 break; 121 case NormalizationForm::NFKC: 122 normalizer = unorm2_getNFKCInstance(&status); 123 break; 124 case NormalizationForm::NFKD: 125 normalizer = unorm2_getNFKDInstance(&status); 126 break; 127 } 128 if (U_FAILURE(status)) { 129 return Err(ToICUError(status)); 130 } 131 132 int32_t spanLengthInt = unorm2_spanQuickCheckYes(normalizer, aString.data(), 133 aString.size(), &status); 134 if (U_FAILURE(status)) { 135 return Err(ToICUError(status)); 136 } 137 138 size_t spanLength = AssertedCast<size_t>(spanLengthInt); 139 MOZ_ASSERT(spanLength <= aString.size()); 140 141 // Return if the input string is already normalized. 142 if (spanLength == aString.size()) { 143 return AlreadyNormalized::Yes; 144 } 145 146 if (!aBuffer.reserve(aString.size())) { 147 return Err(ICUError::OutOfMemory); 148 } 149 150 // Copy the already normalized prefix. 151 if (spanLength > 0) { 152 PodCopy(aBuffer.data(), aString.data(), spanLength); 153 154 aBuffer.written(spanLength); 155 } 156 157 MOZ_TRY(FillBufferWithICUCall( 158 aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { 159 Span<const char16_t> remaining = aString.From(spanLength); 160 return unorm2_normalizeSecondAndAppend(normalizer, target, spanLength, 161 length, remaining.data(), 162 remaining.size(), status); 163 })); 164 165 return AlreadyNormalized::No; 166 } 167 168 /** 169 * Return true if the code point has the binary property "Cased". 170 */ 171 static bool IsCased(char32_t codePoint) { 172 return u_hasBinaryProperty(static_cast<UChar32>(codePoint), UCHAR_CASED); 173 } 174 175 /** 176 * Return true if the code point has the binary property "Case_Ignorable". 177 */ 178 static bool IsCaseIgnorable(char32_t codePoint) { 179 return u_hasBinaryProperty(static_cast<UChar32>(codePoint), 180 UCHAR_CASE_IGNORABLE); 181 } 182 183 /** 184 * Return the NFC pairwise composition of the two input characters, if any; 185 * returns 0 (which we know is not a composed char!) if none exists. 186 */ 187 static char32_t ComposePairNFC(char32_t a, char32_t b) { 188 // unorm2_getNFCInstance returns a static instance that does not have to be 189 // released here. If it fails, we just return 0 (no composition) always. 190 static UErrorCode status = U_ZERO_ERROR; 191 static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status); 192 if (U_FAILURE(status)) { 193 return 0; 194 } 195 UChar32 ch = unorm2_composePair(normalizer, static_cast<UChar32>(a), 196 static_cast<UChar32>(b)); 197 return ch < 0 ? 0 : static_cast<char32_t>(ch); 198 } 199 200 /** 201 * Put the "raw" (single-level) canonical decomposition of the input char, if 202 * any, into the provided buffer. Canonical decomps are never more than two 203 * chars in length (although full normalization may result in longer output 204 * due to recursion). 205 * Returns the length of the decomposition (0 if none, else 1 or 2). 206 */ 207 static int DecomposeRawNFD(char32_t ab, char32_t decomp[2]) { 208 // unorm2_getNFCInstance returns a static instance that does not have to be 209 // released here. If it fails, we just return 0 (no decomposition) always. 210 // Although we are using it to query for a decomposition, the mode of the 211 // Normalizer2 is irrelevant here, so we may as well use the same singleton 212 // instance as ComposePairNFC. 213 static UErrorCode status = U_ZERO_ERROR; 214 static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status); 215 if (U_FAILURE(status)) { 216 return 0; 217 } 218 219 // Canonical decompositions are never more than two Unicode characters, 220 // or a maximum of 4 utf-16 code units. 221 const unsigned MAX_DECOMP_LENGTH = 4; 222 UErrorCode error = U_ZERO_ERROR; 223 UChar decompUtf16[MAX_DECOMP_LENGTH]; 224 int32_t len = 225 unorm2_getRawDecomposition(normalizer, static_cast<UChar32>(ab), 226 decompUtf16, MAX_DECOMP_LENGTH, &error); 227 if (U_FAILURE(error) || len < 0) { 228 return 0; 229 } 230 UText text = UTEXT_INITIALIZER; 231 utext_openUChars(&text, decompUtf16, len, &error); 232 MOZ_ASSERT(U_SUCCESS(error)); 233 UChar32 ch = UTEXT_NEXT32(&text); 234 len = 0; 235 if (ch != U_SENTINEL) { 236 decomp[0] = static_cast<char32_t>(ch); 237 ++len; 238 ch = UTEXT_NEXT32(&text); 239 if (ch != U_SENTINEL) { 240 decomp[1] = static_cast<char32_t>(ch); 241 ++len; 242 } 243 } 244 utext_close(&text); 245 return len; 246 } 247 248 /** 249 * Return the Unicode version, for example "13.0". 250 */ 251 static Span<const char> GetUnicodeVersion(); 252 }; 253 254 } // namespace mozilla::intl 255 256 #endif