nsUnicodeProperties.h (7173B)
1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim:set ts=4 sw=2 sts=2 et cindent: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef NS_UNICODEPROPERTIES_H 8 #define NS_UNICODEPROPERTIES_H 9 10 #include "mozilla/intl/UnicodeProperties.h" 11 12 #include "mozilla/Span.h" 13 #include "nsBidiUtils.h" 14 #include "nsUGenCategory.h" 15 #include "harfbuzz/hb.h" 16 17 namespace mozilla { 18 19 namespace unicode { 20 21 extern const nsUGenCategory sDetailedToGeneralCategory[]; 22 23 /* This values must match the values by UVerticalOrientation by ICU */ 24 enum VerticalOrientation { 25 VERTICAL_ORIENTATION_R = 0, 26 VERTICAL_ORIENTATION_Tr = 1, 27 VERTICAL_ORIENTATION_Tu = 2, 28 VERTICAL_ORIENTATION_U = 3, 29 }; 30 31 /* This MUST match the values assigned by genUnicodePropertyData.pl! */ 32 enum PairedBracketType { 33 PAIRED_BRACKET_TYPE_NONE = 0, 34 PAIRED_BRACKET_TYPE_OPEN = 1, 35 PAIRED_BRACKET_TYPE_CLOSE = 2 36 }; 37 38 /* This values must match the values by UIdentifierStatus by ICU */ 39 enum IdentifierType { 40 IDTYPE_RESTRICTED = 0, 41 IDTYPE_ALLOWED = 1, 42 }; 43 44 enum EmojiPresentation { TextOnly = 0, TextDefault = 1, EmojiDefault = 2 }; 45 46 const uint32_t kVariationSelector15 = 0xFE0E; // text presentation 47 const uint32_t kVariationSelector16 = 0xFE0F; // emoji presentation 48 static inline bool IsEmojiPresentationSelector(uint32_t aCh) { 49 return aCh >= kVariationSelector15 && aCh <= kVariationSelector16; 50 } 51 52 // Unicode values for EMOJI MODIFIER FITZPATRICK TYPE-* 53 const uint32_t kEmojiSkinToneFirst = 0x1f3fb; 54 const uint32_t kEmojiSkinToneLast = 0x1f3ff; 55 static inline bool IsEmojiSkinToneModifier(uint32_t aCh) { 56 return aCh >= kEmojiSkinToneFirst && aCh <= kEmojiSkinToneLast; 57 } 58 59 extern const hb_unicode_general_category_t sICUtoHBcategory[]; 60 61 // NOTE: This returns values matching harfbuzz HB_UNICODE_GENERAL_CATEGORY_* 62 // constants, NOT the mozilla::intl::GeneralCategory enum. 63 // For the GeneralCategory enum, use intl::UnicodeProperties::CharType itself. 64 inline uint8_t GetGeneralCategory(uint32_t aCh) { 65 return sICUtoHBcategory[unsigned(intl::UnicodeProperties::CharType(aCh))]; 66 } 67 68 inline int8_t GetNumericValue(uint32_t aCh) { 69 return intl::UnicodeProperties::GetNumericValue(aCh); 70 } 71 72 inline uint8_t GetLineBreakClass(uint32_t aCh) { 73 return intl::UnicodeProperties::GetIntPropertyValue( 74 aCh, intl::UnicodeProperties::IntProperty::LineBreak); 75 } 76 77 inline uint32_t GetScriptTagForCode(intl::Script aScriptCode) { 78 const char* tag = intl::UnicodeProperties::GetScriptShortName(aScriptCode); 79 if (tag) { 80 return HB_TAG(tag[0], tag[1], tag[2], tag[3]); 81 } 82 // return UNKNOWN script tag (running with older ICU?) 83 return HB_SCRIPT_UNKNOWN; 84 } 85 86 inline PairedBracketType GetPairedBracketType(uint32_t aCh) { 87 return PairedBracketType(intl::UnicodeProperties::GetIntPropertyValue( 88 aCh, intl::UnicodeProperties::IntProperty::BidiPairedBracketType)); 89 } 90 91 inline uint32_t GetTitlecaseForLower( 92 uint32_t aCh) // maps LC to titlecase, UC unchanged 93 { 94 return intl::UnicodeProperties::IsLowercase(aCh) 95 ? intl::UnicodeProperties::ToTitle(aCh) 96 : aCh; 97 } 98 99 inline uint32_t GetTitlecaseForAll( 100 uint32_t aCh) // maps both UC and LC to titlecase 101 { 102 return intl::UnicodeProperties::ToTitle(aCh); 103 } 104 105 inline uint32_t GetFoldedcase(uint32_t aCh) { 106 // Handle dotted capital I and dotless small i specially because we want to 107 // use a combination of ordinary case-folding rules and Turkish case-folding 108 // rules. 109 if (aCh == 0x0130 || aCh == 0x0131) { 110 return 'i'; 111 } 112 return intl::UnicodeProperties::FoldCase(aCh); 113 } 114 115 inline bool IsDefaultIgnorable(uint32_t aCh) { 116 return intl::UnicodeProperties::HasBinaryProperty( 117 aCh, intl::UnicodeProperties::BinaryProperty::DefaultIgnorableCodePoint); 118 } 119 120 inline EmojiPresentation GetEmojiPresentation(uint32_t aCh) { 121 if (!intl::UnicodeProperties::HasBinaryProperty( 122 aCh, intl::UnicodeProperties::BinaryProperty::Emoji)) { 123 return TextOnly; 124 } 125 126 if (intl::UnicodeProperties::HasBinaryProperty( 127 aCh, intl::UnicodeProperties::BinaryProperty::EmojiPresentation)) { 128 return EmojiDefault; 129 } 130 return TextDefault; 131 } 132 133 // returns the simplified Gen Category as defined in nsUGenCategory 134 inline nsUGenCategory GetGenCategory(uint32_t aCh) { 135 return sDetailedToGeneralCategory[GetGeneralCategory(aCh)]; 136 } 137 138 inline VerticalOrientation GetVerticalOrientation(uint32_t aCh) { 139 return VerticalOrientation(intl::UnicodeProperties::GetIntPropertyValue( 140 aCh, intl::UnicodeProperties::IntProperty::VerticalOrientation)); 141 } 142 143 inline IdentifierType GetIdentifierType(uint32_t aCh) { 144 return IdentifierType(intl::UnicodeProperties::GetIntPropertyValue( 145 aCh, intl::UnicodeProperties::IntProperty::IdentifierStatus)); 146 } 147 148 uint32_t GetFullWidth(uint32_t aCh); 149 // This is the reverse function of GetFullWidth which guarantees that 150 // for every codepoint c, GetFullWidthInverse(GetFullWidth(c)) == c. 151 // Note that, this function does not guarantee to convert all wide 152 // form characters to their possible narrow form. 153 uint32_t GetFullWidthInverse(uint32_t aCh); 154 155 bool IsClusterExtender(uint32_t aCh, uint8_t aCategory); 156 157 inline bool IsClusterExtender(uint32_t aCh) { 158 // There are no cluster-extender characters before the first combining- 159 // character block at U+03xx, so we short-circuit here to avoid the cost 160 // of calling GetGeneralCategory for Latin-1 letters etc. 161 return aCh >= 0x0300 && IsClusterExtender(aCh, GetGeneralCategory(aCh)); 162 } 163 164 bool IsClusterExtenderExcludingJoiners(uint32_t aCh, uint8_t aCategory); 165 166 inline bool IsClusterExtenderExcludingJoiners(uint32_t aCh) { 167 return aCh >= 0x0300 && 168 IsClusterExtenderExcludingJoiners(aCh, GetGeneralCategory(aCh)); 169 } 170 171 // Count the number of grapheme clusters in the given string 172 uint32_t CountGraphemeClusters(Span<const char16_t> aText); 173 174 // Determine whether a character is a "combining diacritic" for the purpose 175 // of diacritic-insensitive text search. Examples of such characters include 176 // European accents and Hebrew niqqud, but not Hangul components or Thaana 177 // vowels, even though Thaana vowels are combining nonspacing marks that could 178 // be considered diacritics. 179 // As an exception to strictly following Unicode properties, we exclude the 180 // Japanese kana voicing marks 181 // 3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM 182 // 309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM 183 // which users report should not be ignored (bug 1624244). 184 // See is_combining_diacritic in base_chars.py and is_combining_diacritic.py. 185 // 186 // TODO: once ICU4X is integrated (replacing ICU4C) as the source of Unicode 187 // properties, re-evaluate whether building the static bitset is worthwhile 188 // or if we can revert to simply getting the combining class and comparing 189 // to the values we care about at runtime. 190 bool IsCombiningDiacritic(uint32_t aCh); 191 192 // Remove diacritics from a character 193 uint32_t GetNaked(uint32_t aCh); 194 195 } // end namespace unicode 196 197 } // end namespace mozilla 198 199 #endif /* NS_UNICODEPROPERTIES_H */