nsUnicodeProperties.cpp (8342B)
1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim:set ts=4 sw=2 sts=2 et cindent: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "nsUnicodeProperties.h" 8 #include "nsUnicodePropertyData.cpp" 9 10 #include "mozilla/intl/Segmenter.h" 11 12 #include "BaseChars.h" 13 #include "IsCombiningDiacritic.h" 14 15 #define UNICODE_BMP_LIMIT 0x10000 16 17 namespace mozilla { 18 19 namespace unicode { 20 21 /* 22 To store properties for a million Unicode codepoints compactly, we use 23 a three-level array structure, with the Unicode values considered as 24 three elements: Plane, Page, and Char. 25 26 Space optimization happens because multiple Planes can refer to the same 27 Page array, and multiple Pages can refer to the same Char array holding 28 the actual values. In practice, most of the higher planes are empty and 29 thus share the same data; and within the BMP, there are also many pages 30 that repeat the same data for any given property. 31 32 Plane is usually zero, so we skip a lookup in this case, and require 33 that the Plane 0 pages are always the first set of entries in the Page 34 array. 35 36 The division of the remaining 16 bits into Page and Char fields is 37 adjusted for each property (by experiment using the generation tool) 38 to provide the most compact storage, depending on the distribution 39 of values. 40 */ 41 42 const nsUGenCategory sDetailedToGeneralCategory[] = { 43 // clang-format off 44 /* 45 * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants 46 * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h. 47 */ 48 /* CONTROL */ nsUGenCategory::kOther, 49 /* FORMAT */ nsUGenCategory::kOther, 50 /* UNASSIGNED */ nsUGenCategory::kOther, 51 /* PRIVATE_USE */ nsUGenCategory::kOther, 52 /* SURROGATE */ nsUGenCategory::kOther, 53 /* LOWERCASE_LETTER */ nsUGenCategory::kLetter, 54 /* MODIFIER_LETTER */ nsUGenCategory::kLetter, 55 /* OTHER_LETTER */ nsUGenCategory::kLetter, 56 /* TITLECASE_LETTER */ nsUGenCategory::kLetter, 57 /* UPPERCASE_LETTER */ nsUGenCategory::kLetter, 58 /* COMBINING_MARK */ nsUGenCategory::kMark, 59 /* ENCLOSING_MARK */ nsUGenCategory::kMark, 60 /* NON_SPACING_MARK */ nsUGenCategory::kMark, 61 /* DECIMAL_NUMBER */ nsUGenCategory::kNumber, 62 /* LETTER_NUMBER */ nsUGenCategory::kNumber, 63 /* OTHER_NUMBER */ nsUGenCategory::kNumber, 64 /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation, 65 /* DASH_PUNCTUATION */ nsUGenCategory::kPunctuation, 66 /* CLOSE_PUNCTUATION */ nsUGenCategory::kPunctuation, 67 /* FINAL_PUNCTUATION */ nsUGenCategory::kPunctuation, 68 /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation, 69 /* OTHER_PUNCTUATION */ nsUGenCategory::kPunctuation, 70 /* OPEN_PUNCTUATION */ nsUGenCategory::kPunctuation, 71 /* CURRENCY_SYMBOL */ nsUGenCategory::kSymbol, 72 /* MODIFIER_SYMBOL */ nsUGenCategory::kSymbol, 73 /* MATH_SYMBOL */ nsUGenCategory::kSymbol, 74 /* OTHER_SYMBOL */ nsUGenCategory::kSymbol, 75 /* LINE_SEPARATOR */ nsUGenCategory::kSeparator, 76 /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator, 77 /* SPACE_SEPARATOR */ nsUGenCategory::kSeparator 78 // clang-format on 79 }; 80 81 const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = { 82 // clang-format off 83 HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0, 84 HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1, 85 HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2, 86 HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3, 87 HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4, 88 HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5, 89 HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6, 90 HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7, 91 HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8, 92 HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9, 93 HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10, 94 HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11, 95 HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12, 96 HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13, 97 HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14, 98 HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15, 99 HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16, 100 HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17, 101 HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18, 102 HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19, 103 HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20, 104 HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21, 105 HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22, 106 HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23, 107 HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24, 108 HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25, 109 HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26, 110 HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27, 111 HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28, 112 HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29, 113 // clang-format on 114 }; 115 116 #define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_) \ 117 uint32_t Get##prefix_(uint32_t aCh) { \ 118 if (aCh >= UNICODE_BMP_LIMIT) { \ 119 return aCh; \ 120 } \ 121 auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits]; \ 122 auto index = aCh & ((1 << k##prefix_##CharBits) - 1); \ 123 uint32_t v = s##prefix_##Values[page][index]; \ 124 return v ? v : aCh; \ 125 } 126 127 // full-width mappings only exist for BMP characters; all others are 128 // returned unchanged 129 DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth) 130 DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse) 131 132 bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) { 133 return ( 134 (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK && 135 aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) || 136 (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ 137 (aCh >= 0xff9e && aCh <= 0xff9f) || // katakana sound marks 138 (aCh >= 0x1F3FB && aCh <= 0x1F3FF) || // fitzpatrick skin tone modifiers 139 (aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters 140 } 141 142 bool IsClusterExtenderExcludingJoiners(uint32_t aCh, uint8_t aCategory) { 143 return ( 144 (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK && 145 aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) || 146 (aCh >= 0xff9e && aCh <= 0xff9f) || // katakana sound marks 147 (aCh >= 0x1F3FB && aCh <= 0x1F3FF) || // fitzpatrick skin tone modifiers 148 (aCh >= 0xe0020 && aCh <= 0xe007f)); // emoji (flag) tag characters 149 } 150 151 uint32_t CountGraphemeClusters(Span<const char16_t> aText) { 152 if (aText.IsEmpty()) { 153 // Fast path for empty text. 154 return 0; 155 } 156 intl::GraphemeClusterBreakIteratorUtf16 iter(aText); 157 uint32_t result = 0; 158 while (iter.Next()) { 159 ++result; 160 } 161 return result; 162 } 163 164 uint32_t GetNaked(uint32_t aCh) { 165 uint32_t index = aCh >> 8; 166 if (index >= std::size(BASE_CHAR_MAPPING_BLOCK_INDEX)) { 167 return aCh; 168 } 169 index = BASE_CHAR_MAPPING_BLOCK_INDEX[index]; 170 if (index == 0xff) { 171 return aCh; 172 } 173 const BaseCharMappingBlock& block = BASE_CHAR_MAPPING_BLOCKS[index]; 174 uint8_t lo = aCh & 0xff; 175 if (lo < block.mFirst || lo > block.mLast) { 176 return aCh; 177 } 178 return (aCh & 0xffff0000) | 179 BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst]; 180 } 181 182 bool IsCombiningDiacritic(uint32_t aCh) { 183 return sCombiningDiacriticsSet->test(aCh); 184 } 185 186 } // end namespace unicode 187 188 } // end namespace mozilla