UnicodeProperties.h (12676B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 #ifndef intl_components_UnicodeProperties_h_ 5 #define intl_components_UnicodeProperties_h_ 6 7 #include "mozilla/intl/BidiClass.h" 8 #include "mozilla/intl/GeneralCategory.h" 9 #include "mozilla/intl/ICU4CGlue.h" 10 #include "mozilla/intl/UnicodeScriptCodes.h" 11 #include "mozilla/Vector.h" 12 13 #include "unicode/uchar.h" 14 #include "unicode/uscript.h" 15 16 namespace mozilla::intl { 17 18 /** 19 * This component is a Mozilla-focused API for working with text properties. 20 */ 21 class UnicodeProperties final { 22 public: 23 /** 24 * Return the BidiClass for the character. 25 */ 26 static inline BidiClass GetBidiClass(uint32_t aCh) { 27 return BidiClass(u_charDirection(aCh)); 28 } 29 30 /** 31 * Maps the specified character to a "mirror-image" character. 32 */ 33 static inline uint32_t CharMirror(uint32_t aCh) { return u_charMirror(aCh); } 34 35 /** 36 * Return the general category value for the code point. 37 */ 38 static inline GeneralCategory CharType(uint32_t aCh) { 39 return GeneralCategory(u_charType(aCh)); 40 } 41 42 /** 43 * Determine whether the code point has the Bidi_Mirrored property. 44 */ 45 static inline bool IsMirrored(uint32_t aCh) { return u_isMirrored(aCh); } 46 47 /** 48 * Returns the combining class of the code point as specified in 49 * UnicodeData.txt. 50 */ 51 static inline uint8_t GetCombiningClass(uint32_t aCh) { 52 return u_getCombiningClass(aCh); 53 } 54 55 enum class IntProperty { 56 BidiPairedBracketType, 57 EastAsianWidth, 58 HangulSyllableType, 59 IdentifierStatus, 60 LineBreak, 61 NumericType, 62 VerticalOrientation, 63 }; 64 65 /** 66 * Get the property value for an enumerated or integer Unicode property for a 67 * code point. 68 */ 69 static inline int32_t GetIntPropertyValue(uint32_t aCh, IntProperty aProp) { 70 UProperty prop; 71 switch (aProp) { 72 case IntProperty::BidiPairedBracketType: 73 prop = UCHAR_BIDI_PAIRED_BRACKET_TYPE; 74 break; 75 case IntProperty::EastAsianWidth: 76 prop = UCHAR_EAST_ASIAN_WIDTH; 77 break; 78 case IntProperty::HangulSyllableType: 79 prop = UCHAR_HANGUL_SYLLABLE_TYPE; 80 break; 81 case IntProperty::LineBreak: 82 prop = UCHAR_LINE_BREAK; 83 break; 84 case IntProperty::NumericType: 85 prop = UCHAR_NUMERIC_TYPE; 86 break; 87 case IntProperty::VerticalOrientation: 88 prop = UCHAR_VERTICAL_ORIENTATION; 89 break; 90 case IntProperty::IdentifierStatus: 91 prop = UCHAR_IDENTIFIER_STATUS; 92 break; 93 } 94 return u_getIntPropertyValue(aCh, prop); 95 } 96 97 /** 98 * Get the numeric value for a Unicode code point as defined in the 99 * Unicode Character Database if the input is decimal or a digit, 100 * otherwise, returns -1. 101 */ 102 static inline int8_t GetNumericValue(uint32_t aCh) { 103 UNumericType type = 104 UNumericType(GetIntPropertyValue(aCh, IntProperty::NumericType)); 105 return type == U_NT_DECIMAL || type == U_NT_DIGIT 106 ? int8_t(u_getNumericValue(aCh)) 107 : -1; 108 } 109 110 /** 111 * Maps the specified character to its paired bracket character. 112 */ 113 static inline uint32_t GetBidiPairedBracket(uint32_t aCh) { 114 return u_getBidiPairedBracket(aCh); 115 } 116 117 /** 118 * The given character is mapped to its uppercase equivalent according to 119 * UnicodeData.txt; if the character has no uppercase equivalent, the 120 * character itself is returned. 121 */ 122 static inline uint32_t ToUpper(uint32_t aCh) { return u_toupper(aCh); } 123 124 /** 125 * The given character is mapped to its lowercase equivalent according to 126 * UnicodeData.txt; if the character has no lowercase equivalent, the 127 * character itself is returned. 128 */ 129 static inline uint32_t ToLower(uint32_t aCh) { return u_tolower(aCh); } 130 131 /** 132 * Check if a code point has the Lowercase Unicode property. 133 */ 134 static inline bool IsLowercase(uint32_t aCh) { return u_isULowercase(aCh); } 135 136 /** 137 * The given character is mapped to its titlecase equivalent according to 138 * UnicodeData.txt; if the character has no titlecase equivalent, the 139 * character itself is returned. 140 */ 141 static inline uint32_t ToTitle(uint32_t aCh) { return u_totitle(aCh); } 142 143 /** 144 * The given character is mapped to its case folding equivalent according to 145 * UnicodeData.txt and CaseFolding.txt; 146 * if the character has no case folding equivalent, the character 147 * itself is returned. 148 */ 149 static inline uint32_t FoldCase(uint32_t aCh) { 150 return u_foldCase(aCh, U_FOLD_CASE_DEFAULT); 151 } 152 153 enum class BinaryProperty { 154 DefaultIgnorableCodePoint, 155 Emoji, 156 EmojiPresentation, 157 }; 158 159 /** 160 * Check a binary Unicode property for a code point. 161 */ 162 static inline bool HasBinaryProperty(uint32_t aCh, BinaryProperty aProp) { 163 UProperty prop; 164 switch (aProp) { 165 case BinaryProperty::DefaultIgnorableCodePoint: 166 prop = UCHAR_DEFAULT_IGNORABLE_CODE_POINT; 167 break; 168 case BinaryProperty::Emoji: 169 prop = UCHAR_EMOJI; 170 break; 171 case BinaryProperty::EmojiPresentation: 172 prop = UCHAR_EMOJI_PRESENTATION; 173 break; 174 } 175 return u_hasBinaryProperty(aCh, prop); 176 } 177 178 /** 179 * Check if the width of aCh is full width, half width or wide. 180 */ 181 static inline bool IsEastAsianWidthFHW(uint32_t aCh) { 182 switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) { 183 case U_EA_FULLWIDTH: 184 case U_EA_HALFWIDTH: 185 case U_EA_WIDE: 186 return true; 187 case U_EA_AMBIGUOUS: 188 case U_EA_NARROW: 189 case U_EA_NEUTRAL: 190 return false; 191 } 192 return false; 193 } 194 195 /** 196 * Check if the width of aCh is full width, half width or wide 197 * excluding emoji. 198 */ 199 static inline bool IsEastAsianWidthFHWexcludingEmoji(uint32_t aCh) { 200 switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) { 201 case U_EA_FULLWIDTH: 202 case U_EA_HALFWIDTH: 203 return true; 204 case U_EA_WIDE: 205 return HasBinaryProperty(aCh, BinaryProperty::Emoji) ? false : true; 206 case U_EA_AMBIGUOUS: 207 case U_EA_NARROW: 208 case U_EA_NEUTRAL: 209 return false; 210 } 211 return false; 212 } 213 214 /** 215 * Check if the width of aCh is ambiguous, full width, or wide. 216 */ 217 static inline bool IsEastAsianWidthAFW(uint32_t aCh) { 218 switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) { 219 case U_EA_AMBIGUOUS: 220 case U_EA_FULLWIDTH: 221 case U_EA_WIDE: 222 return true; 223 case U_EA_HALFWIDTH: 224 case U_EA_NARROW: 225 case U_EA_NEUTRAL: 226 return false; 227 } 228 return false; 229 } 230 231 /** 232 * Check if the width of aCh is full width, or wide. 233 */ 234 static inline bool IsEastAsianWidthFW(uint32_t aCh) { 235 switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) { 236 case U_EA_FULLWIDTH: 237 case U_EA_WIDE: 238 return true; 239 case U_EA_AMBIGUOUS: 240 case U_EA_HALFWIDTH: 241 case U_EA_NARROW: 242 case U_EA_NEUTRAL: 243 return false; 244 } 245 return false; 246 } 247 248 /** 249 * Check if the width of aCh is East Asian Fullwidth (F). 250 */ 251 static inline bool IsEastAsianFullWidth(char32_t aCh) { 252 return GetIntPropertyValue(aCh, IntProperty::EastAsianWidth) == 253 U_EA_FULLWIDTH; 254 } 255 256 /** 257 * Check if the CharType of aCh is a letter type. 258 */ 259 static inline bool IsLetter(char32_t aCh) { 260 switch (CharType(aCh)) { 261 case GeneralCategory::Uppercase_Letter: 262 case GeneralCategory::Lowercase_Letter: 263 case GeneralCategory::Titlecase_Letter: 264 case GeneralCategory::Modifier_Letter: 265 case GeneralCategory::Other_Letter: 266 return true; 267 default: 268 return false; 269 } 270 } 271 272 /** 273 * Check if the CharType of aCh is a combining mark type. 274 */ 275 static inline bool IsCombiningMark(char32_t aCh) { 276 switch (CharType(aCh)) { 277 case GeneralCategory::Nonspacing_Mark: 278 case GeneralCategory::Spacing_Mark: 279 case GeneralCategory::Enclosing_Mark: 280 return true; 281 default: 282 return false; 283 } 284 } 285 286 /** 287 * Check if the CharType of aCh is a punctuation type. 288 */ 289 static inline bool IsPunctuation(uint32_t aCh) { 290 switch (CharType(aCh)) { 291 case GeneralCategory::Dash_Punctuation: 292 case GeneralCategory::Open_Punctuation: 293 case GeneralCategory::Close_Punctuation: 294 case GeneralCategory::Connector_Punctuation: 295 case GeneralCategory::Other_Punctuation: 296 case GeneralCategory::Initial_Punctuation: 297 case GeneralCategory::Final_Punctuation: 298 return true; 299 default: 300 return false; 301 } 302 } 303 304 /** 305 * Check if the CharType of aCh is math or other symbol. 306 */ 307 static inline bool IsMathOrMusicSymbol(uint32_t aCh) { 308 // Keep this function in sync with is_math_symbol in base_chars.py. 309 return CharType(aCh) == GeneralCategory::Math_Symbol || 310 CharType(aCh) == GeneralCategory::Other_Symbol; 311 } 312 313 static inline Script GetScriptCode(uint32_t aCh) { 314 // We can safely ignore the error code here because uscript_getScript 315 // returns USCRIPT_INVALID_CODE in the event of an error. 316 UErrorCode err = U_ZERO_ERROR; 317 return Script(uscript_getScript(aCh, &err)); 318 } 319 320 static inline bool HasScript(uint32_t aCh, Script aScript) { 321 return uscript_hasScript(aCh, UScriptCode(aScript)); 322 } 323 324 static inline const char* GetScriptShortName(Script aScript) { 325 return uscript_getShortName(UScriptCode(aScript)); 326 } 327 328 static inline int32_t GetMaxNumberOfScripts() { 329 return u_getIntPropertyMaxValue(UCHAR_SCRIPT); 330 } 331 332 // Return true if aChar belongs to a SEAsian script that is written without 333 // word spaces, so we need to use the "complex breaker" to find possible word 334 // boundaries. (https://en.wikipedia.org/wiki/Scriptio_continua) 335 static bool IsScriptioContinua(char16_t aChar) { 336 Script sc = GetScriptCode(aChar); 337 return sc == Script::THAI || sc == Script::MYANMAR || sc == Script::KHMER || 338 sc == Script::JAVANESE || sc == Script::BALINESE || 339 sc == Script::SUNDANESE || sc == Script::LAO; 340 } 341 342 // Return true if aChar belongs to a cursive script for which inter-character 343 // justification should be disabled. 344 static bool IsCursiveScript(char32_t aChar) { 345 Script sc = GetScriptCode(aChar); 346 return sc == Script::ARABIC || sc == Script::SYRIAC || sc == Script::NKO || 347 sc == Script::MANDAIC || sc == Script::MONGOLIAN || 348 sc == Script::PHAGS_PA || sc == Script::HANIFI_ROHINGYA; 349 } 350 351 // The code point which has the most script extensions is 0x0965, which has 21 352 // script extensions, so choose the vector size as 32 to prevent heap 353 // allocation. 354 static constexpr size_t kMaxScripts = 32; 355 356 using ScriptExtensionVector = Vector<Script, kMaxScripts>; 357 358 /** 359 * Get the script extensions for the given code point, and write the script 360 * extensions to aExtensions vector. If the code point has script extensions, 361 * the script code (Script::COMMON or Script::INHERITED) will be excluded. 362 * 363 * If the code point doesn't have any script extension, then its script code 364 * will be written to aExtensions vector. 365 * 366 * If the code point is invalid, Script::UNKNOWN will be written to 367 * aExtensions vector. 368 * 369 * Note: aExtensions will be cleared after calling this method regardless of 370 * failure. 371 * 372 * See [1] for the script code of the code point, [2] for the script 373 * extensions. 374 * 375 * https://www.unicode.org/Public/UNIDATA/Scripts.txt 376 * https://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt 377 */ 378 static ICUResult GetExtensions(char32_t aCodePoint, 379 ScriptExtensionVector& aExtensions) { 380 // Clear the vector first. 381 aExtensions.clear(); 382 383 // We cannot pass aExtensions to uscript_getScriptExtension as USCriptCode 384 // takes 4 bytes, so create a local UScriptCode array to get the extensions. 385 UScriptCode ext[kMaxScripts]; 386 UErrorCode status = U_ZERO_ERROR; 387 int32_t len = uscript_getScriptExtensions(static_cast<UChar32>(aCodePoint), 388 ext, kMaxScripts, &status); 389 if (U_FAILURE(status)) { 390 // kMaxScripts should be large enough to hold the maximun number of script 391 // extensions. 392 MOZ_DIAGNOSTIC_ASSERT(status != U_BUFFER_OVERFLOW_ERROR); 393 return Err(ToICUError(status)); 394 } 395 396 if (!aExtensions.reserve(len)) { 397 return Err(ICUError::OutOfMemory); 398 } 399 400 for (int32_t i = 0; i < len; i++) { 401 aExtensions.infallibleAppend(Script(ext[i])); 402 } 403 404 return Ok(); 405 } 406 }; 407 408 } // namespace mozilla::intl 409 410 #endif