tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

characterproperties.cpp (13968B)


      1 // © 2018 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // characterproperties.cpp
      5 // created: 2018sep03 Markus W. Scherer
      6 
      7 #include "unicode/utypes.h"
      8 #include "unicode/localpointer.h"
      9 #include "unicode/uchar.h"
     10 #include "unicode/ucpmap.h"
     11 #include "unicode/ucptrie.h"
     12 #include "unicode/umutablecptrie.h"
     13 #include "unicode/uniset.h"
     14 #include "unicode/uscript.h"
     15 #include "unicode/uset.h"
     16 #include "cmemory.h"
     17 #include "emojiprops.h"
     18 #include "mutex.h"
     19 #include "normalizer2impl.h"
     20 #include "uassert.h"
     21 #include "ubidi_props.h"
     22 #include "ucase.h"
     23 #include "ucln_cmn.h"
     24 #include "umutex.h"
     25 #include "uprops.h"
     26 
     27 using icu::LocalPointer;
     28 #if !UCONFIG_NO_NORMALIZATION
     29 using icu::Normalizer2Factory;
     30 using icu::Normalizer2Impl;
     31 #endif
     32 using icu::UInitOnce;
     33 using icu::UnicodeSet;
     34 
     35 namespace {
     36 
     37 UBool U_CALLCONV characterproperties_cleanup();
     38 
     39 constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
     40 
     41 struct Inclusion {
     42    UnicodeSet  *fSet = nullptr;
     43    UInitOnce    fInitOnce {};
     44 };
     45 Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
     46 
     47 UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
     48 
     49 UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
     50 
     51 icu::UMutex cpMutex;
     52 
     53 //----------------------------------------------------------------
     54 // Inclusions list
     55 //----------------------------------------------------------------
     56 
     57 // USetAdder implementation
     58 // Does not use uset.h to reduce code dependencies
     59 void U_CALLCONV
     60 _set_add(USet *set, UChar32 c) {
     61    reinterpret_cast<UnicodeSet*>(set)->add(c);
     62 }
     63 
     64 void U_CALLCONV
     65 _set_addRange(USet *set, UChar32 start, UChar32 end) {
     66    reinterpret_cast<UnicodeSet*>(set)->add(start, end);
     67 }
     68 
     69 void U_CALLCONV
     70 _set_addString(USet *set, const char16_t *str, int32_t length) {
     71    reinterpret_cast<UnicodeSet*>(set)->add(icu::UnicodeString(static_cast<UBool>(length < 0), str, length));
     72 }
     73 
     74 UBool U_CALLCONV characterproperties_cleanup() {
     75    for (Inclusion &in: gInclusions) {
     76        delete in.fSet;
     77        in.fSet = nullptr;
     78        in.fInitOnce.reset();
     79    }
     80    for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
     81        delete sets[i];
     82        sets[i] = nullptr;
     83    }
     84    for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
     85        ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
     86        maps[i] = nullptr;
     87    }
     88    return true;
     89 }
     90 
     91 void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
     92    // This function is invoked only via umtx_initOnce().
     93    U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
     94    if (src == UPROPS_SRC_NONE) {
     95        errorCode = U_INTERNAL_PROGRAM_ERROR;
     96        return;
     97    }
     98    U_ASSERT(gInclusions[src].fSet == nullptr);
     99 
    100    LocalPointer<UnicodeSet> incl(new UnicodeSet());
    101    if (incl.isNull()) {
    102        errorCode = U_MEMORY_ALLOCATION_ERROR;
    103        return;
    104    }
    105    USetAdder sa = {
    106        reinterpret_cast<USet*>(incl.getAlias()),
    107        _set_add,
    108        _set_addRange,
    109        _set_addString,
    110        nullptr, // don't need remove()
    111        nullptr // don't need removeRange()
    112    };
    113 
    114    switch(src) {
    115    case UPROPS_SRC_CHAR:
    116        uchar_addPropertyStarts(&sa, &errorCode);
    117        break;
    118    case UPROPS_SRC_PROPSVEC:
    119        upropsvec_addPropertyStarts(&sa, &errorCode);
    120        break;
    121    case UPROPS_SRC_CHAR_AND_PROPSVEC:
    122        uchar_addPropertyStarts(&sa, &errorCode);
    123        upropsvec_addPropertyStarts(&sa, &errorCode);
    124        break;
    125 #if !UCONFIG_NO_NORMALIZATION
    126    case UPROPS_SRC_CASE_AND_NORM: {
    127        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
    128        if(U_SUCCESS(errorCode)) {
    129            impl->addPropertyStarts(&sa, errorCode);
    130        }
    131        ucase_addPropertyStarts(&sa, &errorCode);
    132        break;
    133    }
    134    case UPROPS_SRC_NFC: {
    135        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
    136        if(U_SUCCESS(errorCode)) {
    137            impl->addPropertyStarts(&sa, errorCode);
    138        }
    139        break;
    140    }
    141    case UPROPS_SRC_NFKC: {
    142        const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
    143        if(U_SUCCESS(errorCode)) {
    144            impl->addPropertyStarts(&sa, errorCode);
    145        }
    146        break;
    147    }
    148    case UPROPS_SRC_NFKC_CF: {
    149        const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
    150        if(U_SUCCESS(errorCode)) {
    151            impl->addPropertyStarts(&sa, errorCode);
    152        }
    153        break;
    154    }
    155    case UPROPS_SRC_NFC_CANON_ITER: {
    156        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
    157        if(U_SUCCESS(errorCode)) {
    158            impl->addCanonIterPropertyStarts(&sa, errorCode);
    159        }
    160        break;
    161    }
    162 #endif
    163    case UPROPS_SRC_CASE:
    164        ucase_addPropertyStarts(&sa, &errorCode);
    165        break;
    166    case UPROPS_SRC_BIDI:
    167        ubidi_addPropertyStarts(&sa, &errorCode);
    168        break;
    169    case UPROPS_SRC_INPC:
    170    case UPROPS_SRC_INSC:
    171    case UPROPS_SRC_VO:
    172        uprops_addPropertyStarts(src, &sa, &errorCode);
    173        break;
    174    case UPROPS_SRC_EMOJI: {
    175        const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
    176        if (U_SUCCESS(errorCode)) {
    177            ep->addPropertyStarts(&sa, errorCode);
    178        }
    179        break;
    180    }
    181    case UPROPS_SRC_IDSU:
    182        // New in Unicode 15.1 for just two characters.
    183        sa.add(sa.set, 0x2FFE);
    184        sa.add(sa.set, 0x2FFF + 1);
    185        break;
    186    case UPROPS_SRC_ID_COMPAT_MATH:
    187    case UPROPS_SRC_MCM:
    188        uprops_addPropertyStarts(src, &sa, &errorCode);
    189        break;
    190    case UPROPS_SRC_BLOCK:
    191        ublock_addPropertyStarts(&sa, errorCode);
    192        break;
    193    default:
    194        errorCode = U_INTERNAL_PROGRAM_ERROR;
    195        break;
    196    }
    197 
    198    if (U_FAILURE(errorCode)) {
    199        return;
    200    }
    201    if (incl->isBogus()) {
    202        errorCode = U_MEMORY_ALLOCATION_ERROR;
    203        return;
    204    }
    205    // Compact for caching.
    206    incl->compact();
    207    gInclusions[src].fSet = incl.orphan();
    208    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
    209 }
    210 
    211 const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
    212    if (U_FAILURE(errorCode)) { return nullptr; }
    213    if (src < 0 || UPROPS_SRC_COUNT <= src) {
    214        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    215        return nullptr;
    216    }
    217    Inclusion &i = gInclusions[src];
    218    umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
    219    return i.fSet;
    220 }
    221 
    222 void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
    223    // This function is invoked only via umtx_initOnce().
    224    U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
    225    int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
    226    U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
    227    UPropertySource src = uprops_getSource(prop);
    228    const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
    229    if (U_FAILURE(errorCode)) {
    230        return;
    231    }
    232 
    233    LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
    234    if (intPropIncl.isNull()) {
    235        errorCode = U_MEMORY_ALLOCATION_ERROR;
    236        return;
    237    }
    238    int32_t numRanges = incl->getRangeCount();
    239    int32_t prevValue = 0;
    240    for (int32_t i = 0; i < numRanges; ++i) {
    241        UChar32 rangeEnd = incl->getRangeEnd(i);
    242        for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
    243            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
    244            int32_t value = u_getIntPropertyValue(c, prop);
    245            if (value != prevValue) {
    246                intPropIncl->add(c);
    247                prevValue = value;
    248            }
    249        }
    250    }
    251 
    252    if (intPropIncl->isBogus()) {
    253        errorCode = U_MEMORY_ALLOCATION_ERROR;
    254        return;
    255    }
    256    // Compact for caching.
    257    intPropIncl->compact();
    258    gInclusions[inclIndex].fSet = intPropIncl.orphan();
    259    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
    260 }
    261 
    262 }  // namespace
    263 
    264 U_NAMESPACE_BEGIN
    265 
    266 const UnicodeSet *CharacterProperties::getInclusionsForProperty(
    267        UProperty prop, UErrorCode &errorCode) {
    268    if (U_FAILURE(errorCode)) { return nullptr; }
    269    if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
    270        int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
    271        Inclusion &i = gInclusions[inclIndex];
    272        umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
    273        return i.fSet;
    274    } else {
    275        UPropertySource src = uprops_getSource(prop);
    276        return getInclusionsForSource(src, errorCode);
    277    }
    278 }
    279 
    280 U_NAMESPACE_END
    281 
    282 namespace {
    283 
    284 UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
    285    if (U_FAILURE(errorCode)) { return nullptr; }
    286    LocalPointer<UnicodeSet> set(new UnicodeSet());
    287    if (set.isNull()) {
    288        errorCode = U_MEMORY_ALLOCATION_ERROR;
    289        return nullptr;
    290    }
    291    if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
    292        // property of strings
    293        const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
    294        if (U_FAILURE(errorCode)) { return nullptr; }
    295        USetAdder sa = {
    296            reinterpret_cast<USet*>(set.getAlias()),
    297            _set_add,
    298            _set_addRange,
    299            _set_addString,
    300            nullptr, // don't need remove()
    301            nullptr // don't need removeRange()
    302        };
    303        ep->addStrings(&sa, property, errorCode);
    304        if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
    305            // property of _only_ strings
    306            set->freeze();
    307            return set.orphan();
    308        }
    309    }
    310 
    311    const UnicodeSet *inclusions =
    312        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    313    if (U_FAILURE(errorCode)) { return nullptr; }
    314    int32_t numRanges = inclusions->getRangeCount();
    315    UChar32 startHasProperty = -1;
    316 
    317    for (int32_t i = 0; i < numRanges; ++i) {
    318        UChar32 rangeEnd = inclusions->getRangeEnd(i);
    319        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
    320            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
    321            if (u_hasBinaryProperty(c, property)) {
    322                if (startHasProperty < 0) {
    323                    // Transition from false to true.
    324                    startHasProperty = c;
    325                }
    326            } else if (startHasProperty >= 0) {
    327                // Transition from true to false.
    328                set->add(startHasProperty, c - 1);
    329                startHasProperty = -1;
    330            }
    331        }
    332    }
    333    if (startHasProperty >= 0) {
    334        set->add(startHasProperty, 0x10FFFF);
    335    }
    336    set->freeze();
    337    return set.orphan();
    338 }
    339 
    340 UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
    341    if (U_FAILURE(errorCode)) { return nullptr; }
    342    uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
    343    icu::LocalUMutableCPTriePointer mutableTrie(
    344        umutablecptrie_open(nullValue, nullValue, &errorCode));
    345    const UnicodeSet *inclusions =
    346        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
    347    if (U_FAILURE(errorCode)) { return nullptr; }
    348    int32_t numRanges = inclusions->getRangeCount();
    349    UChar32 start = 0;
    350    uint32_t value = nullValue;
    351 
    352    for (int32_t i = 0; i < numRanges; ++i) {
    353        UChar32 rangeEnd = inclusions->getRangeEnd(i);
    354        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
    355            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
    356            uint32_t nextValue = u_getIntPropertyValue(c, property);
    357            if (value != nextValue) {
    358                if (value != nullValue) {
    359                    umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
    360                }
    361                start = c;
    362                value = nextValue;
    363            }
    364        }
    365    }
    366    if (value != 0) {
    367        umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
    368    }
    369 
    370    UCPTrieType type;
    371    if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
    372        type = UCPTRIE_TYPE_FAST;
    373    } else {
    374        type = UCPTRIE_TYPE_SMALL;
    375    }
    376    UCPTrieValueWidth valueWidth;
    377    // TODO: UCharacterProperty.IntProperty
    378    int32_t max = u_getIntPropertyMaxValue(property);
    379    if (max <= 0xff) {
    380        valueWidth = UCPTRIE_VALUE_BITS_8;
    381    } else if (max <= 0xffff) {
    382        valueWidth = UCPTRIE_VALUE_BITS_16;
    383    } else {
    384        valueWidth = UCPTRIE_VALUE_BITS_32;
    385    }
    386    return reinterpret_cast<UCPMap *>(
    387        umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
    388 }
    389 
    390 }  // namespace
    391 
    392 U_NAMESPACE_BEGIN
    393 
    394 const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
    395    if (U_FAILURE(errorCode)) { return nullptr; }
    396    if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
    397        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    398        return nullptr;
    399    }
    400    Mutex m(&cpMutex);
    401    UnicodeSet *set = sets[property];
    402    if (set == nullptr) {
    403        sets[property] = set = makeSet(property, errorCode);
    404    }
    405    return set;
    406 }
    407 
    408 U_NAMESPACE_END
    409 
    410 U_NAMESPACE_USE
    411 
    412 U_CAPI const USet * U_EXPORT2
    413 u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
    414    const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
    415    return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;
    416 }
    417 
    418 U_CAPI const UCPMap * U_EXPORT2
    419 u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
    420    if (U_FAILURE(*pErrorCode)) { return nullptr; }
    421    if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
    422        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    423        return nullptr;
    424    }
    425    Mutex m(&cpMutex);
    426    UCPMap *map = maps[property - UCHAR_INT_START];
    427    if (map == nullptr) {
    428        maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
    429    }
    430    return map;
    431 }