emojiprops.cpp (8095B)
1 // © 2021 and later: Unicode, Inc. and others. 2 // License & terms of use: https://www.unicode.org/copyright.html 3 4 // emojiprops.cpp 5 // created: 2021sep04 Markus W. Scherer 6 7 #include "unicode/utypes.h" 8 #include "unicode/uchar.h" 9 #include "unicode/ucharstrie.h" 10 #include "unicode/ucptrie.h" 11 #include "unicode/udata.h" 12 #include "unicode/ustringtrie.h" 13 #include "unicode/utf16.h" 14 #include "emojiprops.h" 15 #include "ucln.h" 16 #include "ucln_cmn.h" 17 #include "umutex.h" 18 #include "uset_imp.h" 19 20 U_NAMESPACE_BEGIN 21 22 namespace { 23 24 EmojiProps *singleton = nullptr; 25 icu::UInitOnce emojiInitOnce {}; 26 27 UBool U_CALLCONV emojiprops_cleanup() { 28 delete singleton; 29 singleton = nullptr; 30 emojiInitOnce.reset(); 31 return true; 32 } 33 34 void U_CALLCONV initSingleton(UErrorCode &errorCode) { 35 if (U_FAILURE(errorCode)) { return; } 36 singleton = new EmojiProps(errorCode); 37 if (singleton == nullptr) { 38 errorCode = U_MEMORY_ALLOCATION_ERROR; 39 } else if (U_FAILURE(errorCode)) { 40 delete singleton; 41 singleton = nullptr; 42 } 43 ucln_common_registerCleanup(UCLN_COMMON_EMOJIPROPS, emojiprops_cleanup); 44 } 45 46 // TODO: turn this into a shared helper function 47 // Requires the major version to match, and then requires at least the minor version. 48 UBool udata_isAcceptableMajorMinor( 49 const UDataInfo &info, const char16_t *dataFormat, uint8_t major, uint8_t minor) { 50 return 51 info.size >= 20 && 52 info.isBigEndian == U_IS_BIG_ENDIAN && 53 info.charsetFamily == U_CHARSET_FAMILY && 54 info.dataFormat[0] == dataFormat[0] && 55 info.dataFormat[1] == dataFormat[1] && 56 info.dataFormat[2] == dataFormat[2] && 57 info.dataFormat[3] == dataFormat[3] && 58 info.formatVersion[0] == major && 59 info.formatVersion[1] >= minor; 60 } 61 62 } // namespace 63 64 EmojiProps::~EmojiProps() { 65 udata_close(memory); 66 ucptrie_close(cpTrie); 67 } 68 69 const EmojiProps * 70 EmojiProps::getSingleton(UErrorCode &errorCode) { 71 if (U_FAILURE(errorCode)) { return nullptr; } 72 umtx_initOnce(emojiInitOnce, &initSingleton, errorCode); 73 return singleton; 74 } 75 76 UBool U_CALLCONV 77 EmojiProps::isAcceptable(void * /*context*/, const char * /*type*/, const char * /*name*/, 78 const UDataInfo *pInfo) { 79 return udata_isAcceptableMajorMinor(*pInfo, u"Emoj", 1, 0); 80 } 81 82 void 83 EmojiProps::load(UErrorCode &errorCode) { 84 memory = udata_openChoice(nullptr, "icu", "uemoji", isAcceptable, this, &errorCode); 85 if (U_FAILURE(errorCode)) { return; } 86 const uint8_t* inBytes = static_cast<const uint8_t*>(udata_getMemory(memory)); 87 const int32_t* inIndexes = reinterpret_cast<const int32_t*>(inBytes); 88 int32_t indexesLength = inIndexes[IX_CPTRIE_OFFSET] / 4; 89 if (indexesLength <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET) { 90 errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes. 91 return; 92 } 93 94 int32_t i = IX_CPTRIE_OFFSET; 95 int32_t offset = inIndexes[i++]; 96 int32_t nextOffset = inIndexes[i]; 97 cpTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_8, 98 inBytes + offset, nextOffset - offset, nullptr, &errorCode); 99 if (U_FAILURE(errorCode)) { 100 return; 101 } 102 103 for (i = IX_BASIC_EMOJI_TRIE_OFFSET; i <= IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET; ++i) { 104 offset = inIndexes[i]; 105 nextOffset = inIndexes[i + 1]; 106 // Set/leave nullptr if there is no UCharsTrie. 107 const char16_t* p = nextOffset > offset ? reinterpret_cast<const char16_t*>(inBytes + offset) : nullptr; 108 stringTries[getStringTrieIndex(i)] = p; 109 } 110 } 111 112 void 113 EmojiProps::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { 114 // Add the start code point of each same-value range of the trie. 115 UChar32 start = 0, end; 116 uint32_t value; 117 while ((end = ucptrie_getRange(cpTrie, start, UCPMAP_RANGE_NORMAL, 0, 118 nullptr, nullptr, &value)) >= 0) { 119 sa->add(sa->set, start); 120 start = end + 1; 121 } 122 } 123 124 UBool 125 EmojiProps::hasBinaryProperty(UChar32 c, UProperty which) { 126 UErrorCode errorCode = U_ZERO_ERROR; 127 const EmojiProps *ep = getSingleton(errorCode); 128 return U_SUCCESS(errorCode) && ep->hasBinaryPropertyImpl(c, which); 129 } 130 131 UBool 132 EmojiProps::hasBinaryPropertyImpl(UChar32 c, UProperty which) const { 133 if (which < UCHAR_EMOJI || UCHAR_RGI_EMOJI < which) { 134 return false; 135 } 136 // Note: UCHAR_REGIONAL_INDICATOR is a single, hardcoded range implemented elsewhere. 137 static constexpr int8_t bitFlags[] = { 138 BIT_EMOJI, // UCHAR_EMOJI=57 139 BIT_EMOJI_PRESENTATION, // UCHAR_EMOJI_PRESENTATION=58 140 BIT_EMOJI_MODIFIER, // UCHAR_EMOJI_MODIFIER=59 141 BIT_EMOJI_MODIFIER_BASE, // UCHAR_EMOJI_MODIFIER_BASE=60 142 BIT_EMOJI_COMPONENT, // UCHAR_EMOJI_COMPONENT=61 143 -1, // UCHAR_REGIONAL_INDICATOR=62 144 -1, // UCHAR_PREPENDED_CONCATENATION_MARK=63 145 BIT_EXTENDED_PICTOGRAPHIC, // UCHAR_EXTENDED_PICTOGRAPHIC=64 146 BIT_BASIC_EMOJI, // UCHAR_BASIC_EMOJI=65 147 -1, // UCHAR_EMOJI_KEYCAP_SEQUENCE=66 148 -1, // UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE=67 149 -1, // UCHAR_RGI_EMOJI_FLAG_SEQUENCE=68 150 -1, // UCHAR_RGI_EMOJI_TAG_SEQUENCE=69 151 -1, // UCHAR_RGI_EMOJI_ZWJ_SEQUENCE=70 152 BIT_BASIC_EMOJI, // UCHAR_RGI_EMOJI=71 153 }; 154 int32_t bit = bitFlags[which - UCHAR_EMOJI]; 155 if (bit < 0) { 156 return false; // not a property that we support in this function 157 } 158 uint8_t bits = UCPTRIE_FAST_GET(cpTrie, UCPTRIE_8, c); 159 return (bits >> bit) & 1; 160 } 161 162 UBool 163 EmojiProps::hasBinaryProperty(const char16_t *s, int32_t length, UProperty which) { 164 UErrorCode errorCode = U_ZERO_ERROR; 165 const EmojiProps *ep = getSingleton(errorCode); 166 return U_SUCCESS(errorCode) && ep->hasBinaryPropertyImpl(s, length, which); 167 } 168 169 UBool 170 EmojiProps::hasBinaryPropertyImpl(const char16_t *s, int32_t length, UProperty which) const { 171 if (s == nullptr && length != 0) { return false; } 172 if (length <= 0 && (length == 0 || *s == 0)) { return false; } // empty string 173 // The caller should have delegated single code points to hasBinaryProperty(c, which). 174 if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) { 175 return false; 176 } 177 UProperty firstProp = which, lastProp = which; 178 if (which == UCHAR_RGI_EMOJI) { 179 // RGI_Emoji is the union of the other emoji properties of strings. 180 firstProp = UCHAR_BASIC_EMOJI; 181 lastProp = UCHAR_RGI_EMOJI_ZWJ_SEQUENCE; 182 } 183 for (int32_t prop = firstProp; prop <= lastProp; ++prop) { 184 const char16_t *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI]; 185 if (trieUChars != nullptr) { 186 UCharsTrie trie(trieUChars); 187 UStringTrieResult result = trie.next(s, length); 188 if (USTRINGTRIE_HAS_VALUE(result)) { 189 return true; 190 } 191 } 192 } 193 return false; 194 } 195 196 void 197 EmojiProps::addStrings(const USetAdder *sa, UProperty which, UErrorCode &errorCode) const { 198 if (U_FAILURE(errorCode)) { return; } 199 if (which < UCHAR_BASIC_EMOJI || UCHAR_RGI_EMOJI < which) { 200 return; 201 } 202 UProperty firstProp = which, lastProp = which; 203 if (which == UCHAR_RGI_EMOJI) { 204 // RGI_Emoji is the union of the other emoji properties of strings. 205 firstProp = UCHAR_BASIC_EMOJI; 206 lastProp = UCHAR_RGI_EMOJI_ZWJ_SEQUENCE; 207 } 208 for (int32_t prop = firstProp; prop <= lastProp; ++prop) { 209 const char16_t *trieUChars = stringTries[prop - UCHAR_BASIC_EMOJI]; 210 if (trieUChars != nullptr) { 211 UCharsTrie::Iterator iter(trieUChars, 0, errorCode); 212 while (iter.next(errorCode)) { 213 const UnicodeString &s = iter.getString(); 214 sa->addString(sa->set, s.getBuffer(), s.length()); 215 } 216 } 217 } 218 } 219 220 U_NAMESPACE_END