tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

static_unicode_sets.cpp (9773B)


      1 // © 2018 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #include "unicode/utypes.h"
      5 
      6 #if !UCONFIG_NO_FORMATTING
      7 
      8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
      9 // Helpful in toString methods and elsewhere.
     10 #define UNISTR_FROM_STRING_EXPLICIT
     11 
     12 #include "static_unicode_sets.h"
     13 #include "umutex.h"
     14 #include "ucln_cmn.h"
     15 #include "unicode/uniset.h"
     16 #include "uresimp.h"
     17 #include "cstring.h"
     18 #include "uassert.h"
     19 
     20 using namespace icu;
     21 using namespace icu::unisets;
     22 
     23 
     24 namespace {
     25 
// Table of the static sets, indexed by Key. All slots start out null; a slot
// that is still null is treated as "empty set" by getImpl().
UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};

// Save the empty instance in static memory to have well-defined behavior if a
// regular UnicodeSet cannot be allocated.
// This is raw, suitably aligned storage only: the UnicodeSet object is
// constructed into it with placement new in initNumberParseUniSets() and is
// destroyed explicitly in cleanupNumberParseUniSets().
alignas(UnicodeSet)
char gEmptyUnicodeSet[sizeof(UnicodeSet)];

// Whether the gEmptyUnicodeSet is initialized and ready to use.
// Set to true by initNumberParseUniSets() and back to false by
// cleanupNumberParseUniSets(), which uses it to decide whether to run the destructor.
UBool gEmptyUnicodeSetInitialized = false;
     35 
     36 inline UnicodeSet* getImpl(Key key) {
     37    UnicodeSet* candidate = gUnicodeSets[key];
     38    if (candidate == nullptr) {
     39        return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
     40    }
     41    return candidate;
     42 }
     43 
     44 UnicodeSet* computeUnion(Key k1, Key k2) {
     45    UnicodeSet* result = new UnicodeSet();
     46    if (result == nullptr) {
     47        return nullptr;
     48    }
     49    result->addAll(*getImpl(k1));
     50    result->addAll(*getImpl(k2));
     51    result->freeze();
     52    return result;
     53 }
     54 
     55 UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
     56    UnicodeSet* result = new UnicodeSet();
     57    if (result == nullptr) {
     58        return nullptr;
     59    }
     60    result->addAll(*getImpl(k1));
     61    result->addAll(*getImpl(k2));
     62    result->addAll(*getImpl(k3));
     63    result->freeze();
     64    return result;
     65 }
     66 
     67 
     68 void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
     69    // assert unicodeSets.get(key) == null;
     70    gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);
     71 }
     72 
     73 class ParseDataSink : public ResourceSink {
     74  public:
     75    void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) override {
     76        ResourceTable contextsTable = value.getTable(status);
     77        if (U_FAILURE(status)) { return; }
     78        for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
     79            if (uprv_strcmp(key, "date") == 0) {
     80                // ignore
     81            } else {
     82                ResourceTable strictnessTable = value.getTable(status);
     83                if (U_FAILURE(status)) { return; }
     84                for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
     85                    bool isLenient = (uprv_strcmp(key, "lenient") == 0);
     86                    ResourceArray array = value.getArray(status);
     87                    if (U_FAILURE(status)) { return; }
     88                    for (int k = 0; k < array.getSize(); k++) {
     89                        array.getValue(k, value);
     90                        UnicodeString str = value.getUnicodeString(status);
     91                        if (U_FAILURE(status)) { return; }
     92                        // There is both lenient and strict data for comma/period,
     93                        // but not for any of the other symbols.
     94                        if (str.indexOf(u'.') != -1) {
     95                            saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
     96                        } else if (str.indexOf(u',') != -1) {
     97                            saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
     98                        } else if (str.indexOf(u'+') != -1) {
     99                            saveSet(PLUS_SIGN, str, status);
    100                        } else if (str.indexOf(u'-') != -1) {
    101                            saveSet(MINUS_SIGN, str, status);
    102                        } else if (str.indexOf(u'$') != -1) {
    103                            saveSet(DOLLAR_SIGN, str, status);
    104                        } else if (str.indexOf(u'£') != -1) {
    105                            saveSet(POUND_SIGN, str, status);
    106                        } else if (str.indexOf(u'₹') != -1) {
    107                            saveSet(RUPEE_SIGN, str, status);
    108                        } else if (str.indexOf(u'¥') != -1) {
    109                            saveSet(YEN_SIGN, str, status);
    110                        } else if (str.indexOf(u'₩') != -1) {
    111                            saveSet(WON_SIGN, str, status);
    112                        } else if (str.indexOf(u'%') != -1) {
    113                            saveSet(PERCENT_SIGN, str, status);
    114                        } else if (str.indexOf(u'‰') != -1) {
    115                            saveSet(PERMILLE_SIGN, str, status);
    116                        } else if (str.indexOf(u'’') != -1) {
    117                            saveSet(APOSTROPHE_SIGN, str, status);
    118                        } else {
    119                            // Unknown class of parse lenients
    120                            // TODO(ICU-20428): Make ICU automatically accept new classes?
    121                            U_ASSERT(false);
    122                        }
    123                        if (U_FAILURE(status)) { return; }
    124                    }
    125                }
    126            }
    127        }
    128    }
    129 };
    130 
    131 
// Init-once guard for the static sets: unisets::get() passes it to
// umtx_initOnce() together with initNumberParseUniSets(), and
// cleanupNumberParseUniSets() resets it.
icu::UInitOnce gNumberParseUniSetsInitOnce {};
    133 
    134 UBool U_CALLCONV cleanupNumberParseUniSets() {
    135    if (gEmptyUnicodeSetInitialized) {
    136        reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();
    137        gEmptyUnicodeSetInitialized = false;
    138    }
    139    for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) {
    140        delete gUnicodeSets[i];
    141        gUnicodeSets[i] = nullptr;
    142    }
    143    gNumberParseUniSetsInitOnce.reset();
    144    return true;
    145 }
    146 
    147 void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
    148    ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
    149 
    150    // Initialize the empty instance for well-defined fallback behavior
    151    new(gEmptyUnicodeSet) UnicodeSet();
    152    reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
    153    gEmptyUnicodeSetInitialized = true;
    154 
    155    // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
    156    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
    157    gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
    158            u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
    159    gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);
    160 
    161    LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
    162    if (U_FAILURE(status)) { return; }
    163    ParseDataSink sink;
    164    ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
    165    if (U_FAILURE(status)) { return; }
    166 
    167    // NOTE: It is OK for these assertions to fail if there was a no-data build.
    168    U_ASSERT(gUnicodeSets[COMMA] != nullptr);
    169    U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
    170    U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
    171    U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
    172    U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);
    173 
    174    LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet(
    175        u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
    176        status
    177    ), status);
    178    if (U_FAILURE(status)) { return; }
    179    otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]);
    180    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();
    181    gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
    182    gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
    183            STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
    184 
    185    U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
    186    U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
    187    U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
    188    U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);
    189 
    190    // The following don't currently have parseLenients in data.
    191    U_ASSERT(gUnicodeSets[INFINITY_SIGN] == nullptr);
    192    gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);
    193    U_ASSERT(gUnicodeSets[APPROXIMATELY_SIGN] == nullptr);
    194    // This set of characters was manually curated from the
    195    // values of the approximatelySign element of CLDR common/main/*.xml files.
    196    gUnicodeSets[APPROXIMATELY_SIGN] = new UnicodeSet(u"[∼~≈≃約]", status);
    197    if (U_FAILURE(status)) { return; }
    198 
    199    U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
    200    U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
    201    U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
    202    U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);
    203    U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);
    204 
    205    gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
    206    if (U_FAILURE(status)) { return; }
    207    gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
    208    gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
    209 
    210    for (auto* uniset : gUnicodeSets) {
    211        if (uniset != nullptr) {
    212            uniset->freeze();
    213        }
    214    }
    215 }
    216 
    217 }
    218 
    219 const UnicodeSet* unisets::get(Key key) {
    220    UErrorCode localStatus = U_ZERO_ERROR;
    221    umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
    222    if (U_FAILURE(localStatus)) {
    223        return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
    224    }
    225    return getImpl(key);
    226 }
    227 
    228 Key unisets::chooseFrom(UnicodeString str, Key key1) {
    229    return get(key1)->contains(str) ? key1 : NONE;
    230 }
    231 
    232 Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
    233    return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);
    234 }
    235 
    236 //Key unisets::chooseCurrency(UnicodeString str) {
    237 //    if (get(DOLLAR_SIGN)->contains(str)) {
    238 //        return DOLLAR_SIGN;
    239 //    } else if (get(POUND_SIGN)->contains(str)) {
    240 //        return POUND_SIGN;
    241 //    } else if (get(RUPEE_SIGN)->contains(str)) {
    242 //        return RUPEE_SIGN;
    243 //    } else if (get(YEN_SIGN)->contains(str)) {
    244 //        return YEN_SIGN;
    245 //    } else {
    246 //        return NONE;
    247 //    }
    248 //}
    249 
    250 
    251 #endif /* #if !UCONFIG_NO_FORMATTING */