static_unicode_sets.cpp (9773B)
1 // © 2018 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include "unicode/utypes.h" 5 6 #if !UCONFIG_NO_FORMATTING 7 8 // Allow implicit conversion from char16_t* to UnicodeString for this file: 9 // Helpful in toString methods and elsewhere. 10 #define UNISTR_FROM_STRING_EXPLICIT 11 12 #include "static_unicode_sets.h" 13 #include "umutex.h" 14 #include "ucln_cmn.h" 15 #include "unicode/uniset.h" 16 #include "uresimp.h" 17 #include "cstring.h" 18 #include "uassert.h" 19 20 using namespace icu; 21 using namespace icu::unisets; 22 23 24 namespace { 25 26 UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {}; 27 28 // Save the empty instance in static memory to have well-defined behavior if a 29 // regular UnicodeSet cannot be allocated. 30 alignas(UnicodeSet) 31 char gEmptyUnicodeSet[sizeof(UnicodeSet)]; 32 33 // Whether the gEmptyUnicodeSet is initialized and ready to use. 34 UBool gEmptyUnicodeSetInitialized = false; 35 36 inline UnicodeSet* getImpl(Key key) { 37 UnicodeSet* candidate = gUnicodeSets[key]; 38 if (candidate == nullptr) { 39 return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet); 40 } 41 return candidate; 42 } 43 44 UnicodeSet* computeUnion(Key k1, Key k2) { 45 UnicodeSet* result = new UnicodeSet(); 46 if (result == nullptr) { 47 return nullptr; 48 } 49 result->addAll(*getImpl(k1)); 50 result->addAll(*getImpl(k2)); 51 result->freeze(); 52 return result; 53 } 54 55 UnicodeSet* computeUnion(Key k1, Key k2, Key k3) { 56 UnicodeSet* result = new UnicodeSet(); 57 if (result == nullptr) { 58 return nullptr; 59 } 60 result->addAll(*getImpl(k1)); 61 result->addAll(*getImpl(k2)); 62 result->addAll(*getImpl(k3)); 63 result->freeze(); 64 return result; 65 } 66 67 68 void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) { 69 // assert unicodeSets.get(key) == null; 70 gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status); 71 } 72 73 class ParseDataSink : public ResourceSink { 74 public: 75 void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) override { 76 ResourceTable contextsTable = value.getTable(status); 77 if (U_FAILURE(status)) { return; } 78 for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) { 79 if (uprv_strcmp(key, "date") == 0) { 80 // ignore 81 } else { 82 ResourceTable strictnessTable = value.getTable(status); 83 if (U_FAILURE(status)) { return; } 84 for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) { 85 bool isLenient = (uprv_strcmp(key, "lenient") == 0); 86 ResourceArray array = value.getArray(status); 87 if (U_FAILURE(status)) { return; } 88 for (int k = 0; k < array.getSize(); k++) { 89 array.getValue(k, value); 90 UnicodeString str = value.getUnicodeString(status); 91 if (U_FAILURE(status)) { return; } 92 // There is both lenient and strict data for comma/period, 93 // but not for any of the other symbols. 94 if (str.indexOf(u'.') != -1) { 95 saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status); 96 } else if (str.indexOf(u',') != -1) { 97 saveSet(isLenient ? COMMA : STRICT_COMMA, str, status); 98 } else if (str.indexOf(u'+') != -1) { 99 saveSet(PLUS_SIGN, str, status); 100 } else if (str.indexOf(u'-') != -1) { 101 saveSet(MINUS_SIGN, str, status); 102 } else if (str.indexOf(u'$') != -1) { 103 saveSet(DOLLAR_SIGN, str, status); 104 } else if (str.indexOf(u'£') != -1) { 105 saveSet(POUND_SIGN, str, status); 106 } else if (str.indexOf(u'₹') != -1) { 107 saveSet(RUPEE_SIGN, str, status); 108 } else if (str.indexOf(u'¥') != -1) { 109 saveSet(YEN_SIGN, str, status); 110 } else if (str.indexOf(u'₩') != -1) { 111 saveSet(WON_SIGN, str, status); 112 } else if (str.indexOf(u'%') != -1) { 113 saveSet(PERCENT_SIGN, str, status); 114 } else if (str.indexOf(u'‰') != -1) { 115 saveSet(PERMILLE_SIGN, str, status); 116 } else if (str.indexOf(u'’') != -1) { 117 saveSet(APOSTROPHE_SIGN, str, status); 118 } else { 119 // Unknown class of parse lenients 120 // TODO(ICU-20428): Make ICU automatically accept new classes? 121 U_ASSERT(false); 122 } 123 if (U_FAILURE(status)) { return; } 124 } 125 } 126 } 127 } 128 } 129 }; 130 131 132 icu::UInitOnce gNumberParseUniSetsInitOnce {}; 133 134 UBool U_CALLCONV cleanupNumberParseUniSets() { 135 if (gEmptyUnicodeSetInitialized) { 136 reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet(); 137 gEmptyUnicodeSetInitialized = false; 138 } 139 for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) { 140 delete gUnicodeSets[i]; 141 gUnicodeSets[i] = nullptr; 142 } 143 gNumberParseUniSetsInitOnce.reset(); 144 return true; 145 } 146 147 void U_CALLCONV initNumberParseUniSets(UErrorCode& status) { 148 ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets); 149 150 // Initialize the empty instance for well-defined fallback behavior 151 new(gEmptyUnicodeSet) UnicodeSet(); 152 reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze(); 153 gEmptyUnicodeSetInitialized = true; 154 155 // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309. 156 // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). 157 gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet( 158 u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status); 159 gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status); 160 161 LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status)); 162 if (U_FAILURE(status)) { return; } 163 ParseDataSink sink; 164 ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status); 165 if (U_FAILURE(status)) { return; } 166 167 // NOTE: It is OK for these assertions to fail if there was a no-data build. 168 U_ASSERT(gUnicodeSets[COMMA] != nullptr); 169 U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr); 170 U_ASSERT(gUnicodeSets[PERIOD] != nullptr); 171 U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr); 172 U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr); 173 174 LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet( 175 u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", 176 status 177 ), status); 178 if (U_FAILURE(status)) { return; } 179 otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]); 180 gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan(); 181 gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS); 182 gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion( 183 STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS); 184 185 U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr); 186 U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr); 187 U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr); 188 U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr); 189 190 // The following don't currently have parseLenients in data. 191 U_ASSERT(gUnicodeSets[INFINITY_SIGN] == nullptr); 192 gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status); 193 U_ASSERT(gUnicodeSets[APPROXIMATELY_SIGN] == nullptr); 194 // This set of characters was manually curated from the 195 // values of the approximatelySign element of CLDR common/main/*.xml files. 196 gUnicodeSets[APPROXIMATELY_SIGN] = new UnicodeSet(u"[∼~≈≃約]", status); 197 if (U_FAILURE(status)) { return; } 198 199 U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr); 200 U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr); 201 U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr); 202 U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr); 203 U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr); 204 205 gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status); 206 if (U_FAILURE(status)) { return; } 207 gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS); 208 gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS); 209 210 for (auto* uniset : gUnicodeSets) { 211 if (uniset != nullptr) { 212 uniset->freeze(); 213 } 214 } 215 } 216 217 } 218 219 const UnicodeSet* unisets::get(Key key) { 220 UErrorCode localStatus = U_ZERO_ERROR; 221 umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus); 222 if (U_FAILURE(localStatus)) { 223 return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet); 224 } 225 return getImpl(key); 226 } 227 228 Key unisets::chooseFrom(UnicodeString str, Key key1) { 229 return get(key1)->contains(str) ? key1 : NONE; 230 } 231 232 Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) { 233 return get(key1)->contains(str) ? key1 : chooseFrom(str, key2); 234 } 235 236 //Key unisets::chooseCurrency(UnicodeString str) { 237 // if (get(DOLLAR_SIGN)->contains(str)) { 238 // return DOLLAR_SIGN; 239 // } else if (get(POUND_SIGN)->contains(str)) { 240 // return POUND_SIGN; 241 // } else if (get(RUPEE_SIGN)->contains(str)) { 242 // return RUPEE_SIGN; 243 // } else if (get(YEN_SIGN)->contains(str)) { 244 // return YEN_SIGN; 245 // } else { 246 // return NONE; 247 // } 248 //} 249 250 251 #endif /* #if !UCONFIG_NO_FORMATTING */