tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

locdistance.cpp (17141B)


      1 // © 2019 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // locdistance.cpp
      5 // created: 2019may08 Markus W. Scherer
      6 
      7 #include "unicode/utypes.h"
      8 #include "unicode/bytestrie.h"
      9 #include "unicode/localematcher.h"
     10 #include "unicode/locid.h"
     11 #include "unicode/uobject.h"
     12 #include "unicode/ures.h"
     13 #include "cstring.h"
     14 #include "locdistance.h"
     15 #include "loclikelysubtags.h"
     16 #include "uassert.h"
     17 #include "ucln_cmn.h"
     18 #include "uinvchar.h"
     19 #include "umutex.h"
     20 
     21 U_NAMESPACE_BEGIN
     22 
     23 namespace {
     24 
     25 /**
     26 * Bit flag used on the last character of a subtag in the trie.
     27 * Must be set consistently by the builder and the lookup code.
     28 */
     29 constexpr int32_t END_OF_SUBTAG = 0x80;
     30 /** Distance value bit flag, set by the builder. */
     31 constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80;
     32 /** Distance value bit flag, set by trieNext(). */
     33 constexpr int32_t DISTANCE_IS_FINAL = 0x100;
     34 constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT;
     35 
     36 constexpr int32_t ABOVE_THRESHOLD = 100;
     37 
     38 // Indexes into array of distances.
     39 enum {
     40    IX_DEF_LANG_DISTANCE,
     41    IX_DEF_SCRIPT_DISTANCE,
     42    IX_DEF_REGION_DISTANCE,
     43    IX_MIN_REGION_DISTANCE,
     44    IX_LIMIT
     45 };
     46 
     47 LocaleDistance *gLocaleDistance = nullptr;
     48 UInitOnce gInitOnce {};
     49 
     50 UBool U_CALLCONV cleanup() {
     51    delete gLocaleDistance;
     52    gLocaleDistance = nullptr;
     53    gInitOnce.reset();
     54    return true;
     55 }
     56 
     57 }  // namespace
     58 
     59 void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
     60    // This function is invoked only via umtx_initOnce().
     61    U_ASSERT(gLocaleDistance == nullptr);
     62    const LikelySubtags &likely = *LikelySubtags::getSingleton(errorCode);
     63    if (U_FAILURE(errorCode)) { return; }
     64    const LocaleDistanceData &data = likely.getDistanceData();
     65    if (data.distanceTrieBytes == nullptr ||
     66            data.regionToPartitions == nullptr || data.partitions == nullptr ||
     67            // ok if no paradigms
     68            data.distances == nullptr) {
     69        errorCode = U_MISSING_RESOURCE_ERROR;
     70        return;
     71    }
     72    gLocaleDistance = new LocaleDistance(data, likely);
     73    if (gLocaleDistance == nullptr) {
     74        errorCode = U_MEMORY_ALLOCATION_ERROR;
     75        return;
     76    }
     77    ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);
     78 }
     79 
     80 const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
     81    if (U_FAILURE(errorCode)) { return nullptr; }
     82    umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);
     83    return gLocaleDistance;
     84 }
     85 
     86 LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const LikelySubtags &likely) :
     87        likelySubtags(likely),
     88        trie(data.distanceTrieBytes),
     89        regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
     90        paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
     91        defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]),
     92        defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]),
     93        defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]),
     94        minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) {
     95    // For the default demotion value, use the
     96    // default region distance between unrelated Englishes.
     97    // Thus, unless demotion is turned off,
     98    // a mere region difference for one desired locale
     99    // is as good as a perfect match for the next following desired locale.
    100    // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
    101    LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR);
    102    LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR);
    103    const LSR *p_enGB = &enGB;
    104    int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1,
    105            shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY);
    106    defaultDemotionPerDesiredLocale  = getDistanceFloor(indexAndDistance);
    107 }
    108 
    109 int32_t LocaleDistance::getBestIndexAndDistance(
    110        const LSR &desired,
    111        const LSR **supportedLSRs, int32_t supportedLSRsLength,
    112        int32_t shiftedThreshold,
    113        ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const {
    114    BytesTrie iter(trie);
    115    // Look up the desired language only once for all supported LSRs.
    116    // Its "distance" is either a match point value of 0, or a non-match negative value.
    117    // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
    118    int32_t desLangDistance = trieNext(iter, desired.language, false);
    119    uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
    120    // Index of the supported LSR with the lowest distance.
    121    int32_t bestIndex = -1;
    122    // Cached lookup info from LikelySubtags.compareLikely().
    123    int32_t bestLikelyInfo = -1;
    124    for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
    125        const LSR &supported = *supportedLSRs[slIndex];
    126        bool star = false;
    127        int32_t distance = desLangDistance;
    128        if (distance >= 0) {
    129            U_ASSERT((distance & DISTANCE_IS_FINAL) == 0);
    130            if (slIndex != 0) {
    131                iter.resetToState64(desLangState);
    132            }
    133            distance = trieNext(iter, supported.language, true);
    134        }
    135        // Note: The data builder verifies that there are no rules with "any" (*) language and
    136        // real (non *) script or region subtags.
    137        // This means that if the lookup for either language fails we can use
    138        // the default distances without further lookups.
    139        int32_t flags;
    140        if (distance >= 0) {
    141            flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
    142            distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
    143        } else {  // <*, *>
    144            if (uprv_strcmp(desired.language, supported.language) == 0) {
    145                distance = 0;
    146            } else {
    147                distance = defaultLanguageDistance;
    148            }
    149            flags = 0;
    150            star = true;
    151        }
    152        U_ASSERT(0 <= distance && distance <= 100);
    153        // Round up the shifted threshold (if fraction bits are not 0)
    154        // for comparison with un-shifted distances until we need fraction bits.
    155        // (If we simply shifted non-zero fraction bits away, then we might ignore a language
    156        // when it's really still a micro distance below the threshold.)
    157        int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
    158        // We implement "favor subtag" by reducing the language subtag distance
    159        // (unscientifically reducing it to a quarter of the normal value),
    160        // so that the script distance is relatively more important.
    161        // For example, given a default language distance of 80, we reduce it to 20,
    162        // which is below the default threshold of 50, which is the default script distance.
    163        if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
    164            distance >>= 2;
    165        }
    166        // Let distance == roundedThreshold pass until the tie-breaker logic
    167        // at the end of the loop.
    168        if (distance > roundedThreshold) {
    169            continue;
    170        }
    171 
    172        int32_t scriptDistance;
    173        if (star || flags != 0) {
    174            if (uprv_strcmp(desired.script, supported.script) == 0) {
    175                scriptDistance = 0;
    176            } else {
    177                scriptDistance = defaultScriptDistance;
    178            }
    179        } else {
    180            scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(),
    181                    desired.script, supported.script);
    182            flags = scriptDistance & DISTANCE_IS_FINAL;
    183            scriptDistance &= ~DISTANCE_IS_FINAL;
    184        }
    185        distance += scriptDistance;
    186        if (distance > roundedThreshold) {
    187            continue;
    188        }
    189 
    190        if (uprv_strcmp(desired.region, supported.region) == 0) {
    191            // regionDistance = 0
    192        } else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
    193            distance += defaultRegionDistance;
    194        } else {
    195            int32_t remainingThreshold = roundedThreshold - distance;
    196            if (minRegionDistance > remainingThreshold) {
    197                continue;
    198            }
    199 
    200            // From here on we know the regions are not equal.
    201            // Map each region to zero or more partitions. (zero = one non-matching string)
    202            // (Each array of single-character partition strings is encoded as one string.)
    203            // If either side has more than one, then we find the maximum distance.
    204            // This could be optimized by adding some more structure, but probably not worth it.
    205            distance += getRegionPartitionsDistance(
    206                    iter, iter.getState64(),
    207                    partitionsForRegion(desired),
    208                    partitionsForRegion(supported),
    209                    remainingThreshold);
    210        }
    211        int32_t shiftedDistance = shiftDistance(distance);
    212        if (shiftedDistance == 0) {
    213            // Distinguish between equivalent but originally unequal locales via an
    214            // additional micro distance.
    215            shiftedDistance |= (desired.flags ^ supported.flags);
    216            if (shiftedDistance < shiftedThreshold) {
    217                if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
    218                        // Is there also a match when we swap desired/supported?
    219                        isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
    220                    if (shiftedDistance == 0) {
    221                        return slIndex << INDEX_SHIFT;
    222                    }
    223                    bestIndex = slIndex;
    224                    shiftedThreshold = shiftedDistance;
    225                    bestLikelyInfo = -1;
    226                }
    227            }
    228        } else {
    229            if (shiftedDistance < shiftedThreshold) {
    230                if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
    231                        // Is there also a match when we swap desired/supported?
    232                        isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
    233                    bestIndex = slIndex;
    234                    shiftedThreshold = shiftedDistance;
    235                    bestLikelyInfo = -1;
    236                }
    237            } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
    238                if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
    239                        // Is there also a match when we swap desired/supported?
    240                        isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
    241                    bestLikelyInfo = likelySubtags.compareLikely(
    242                            supported, *supportedLSRs[bestIndex], bestLikelyInfo);
    243                    if ((bestLikelyInfo & 1) != 0) {
    244                        // This supported locale matches as well as the previous best match,
    245                        // and neither matches perfectly,
    246                        // but this one is "more likely" (has more-default subtags).
    247                        bestIndex = slIndex;
    248                    }
    249                }
    250            }
    251        }
    252    }
    253    return bestIndex >= 0 ?
    254            (bestIndex << INDEX_SHIFT) | shiftedThreshold :
    255            INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
    256 }
    257 
    258 int32_t LocaleDistance::getDesSuppScriptDistance(
    259        BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) {
    260    // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
    261    int32_t distance = trieNext(iter, desired, false);
    262    if (distance >= 0) {
    263        distance = trieNext(iter, supported, true);
    264    }
    265    if (distance < 0) {
    266        UStringTrieResult result = iter.resetToState64(startState).next(u'*');  // <*, *>
    267        U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
    268        if (uprv_strcmp(desired, supported) == 0) {
    269            distance = 0;  // same script
    270        } else {
    271            distance = iter.getValue();
    272            U_ASSERT(distance >= 0);
    273        }
    274        if (result == USTRINGTRIE_FINAL_VALUE) {
    275            distance |= DISTANCE_IS_FINAL;
    276        }
    277    }
    278    return distance;
    279 }
    280 
    281 int32_t LocaleDistance::getRegionPartitionsDistance(
    282        BytesTrie &iter, uint64_t startState,
    283        const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) {
    284    char desired = *desiredPartitions++;
    285    char supported = *supportedPartitions++;
    286    U_ASSERT(desired != 0 && supported != 0);
    287    // See if we have single desired/supported partitions, from NUL-terminated
    288    // partition strings without explicit length.
    289    bool suppLengthGt1 = *supportedPartitions != 0;  // gt1: more than 1 character
    290    // equivalent to: if (desLength == 1 && suppLength == 1)
    291    if (*desiredPartitions == 0 && !suppLengthGt1) {
    292        // Fastpath for single desired/supported partitions.
    293        UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
    294        if (USTRINGTRIE_HAS_NEXT(result)) {
    295            result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
    296            if (USTRINGTRIE_HAS_VALUE(result)) {
    297                return iter.getValue();
    298            }
    299        }
    300        return getFallbackRegionDistance(iter, startState);
    301    }
    302 
    303    const char *supportedStart = supportedPartitions - 1;  // for restart of inner loop
    304    int32_t regionDistance = 0;
    305    // Fall back to * only once, not for each pair of partition strings.
    306    bool star = false;
    307    for (;;) {
    308        // Look up each desired-partition string only once,
    309        // not for each (desired, supported) pair.
    310        UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
    311        if (USTRINGTRIE_HAS_NEXT(result)) {
    312            uint64_t desState = suppLengthGt1 ? iter.getState64() : 0;
    313            for (;;) {
    314                result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
    315                int32_t d;
    316                if (USTRINGTRIE_HAS_VALUE(result)) {
    317                    d = iter.getValue();
    318                } else if (star) {
    319                    d = 0;
    320                } else {
    321                    d = getFallbackRegionDistance(iter, startState);
    322                    star = true;
    323                }
    324                if (d > threshold) {
    325                    return d;
    326                } else if (regionDistance < d) {
    327                    regionDistance = d;
    328                }
    329                if ((supported = *supportedPartitions++) != 0) {
    330                    iter.resetToState64(desState);
    331                } else {
    332                    break;
    333                }
    334            }
    335        } else if (!star) {
    336            int32_t d = getFallbackRegionDistance(iter, startState);
    337            if (d > threshold) {
    338                return d;
    339            } else if (regionDistance < d) {
    340                regionDistance = d;
    341            }
    342            star = true;
    343        }
    344        if ((desired = *desiredPartitions++) != 0) {
    345            iter.resetToState64(startState);
    346            supportedPartitions = supportedStart;
    347            supported = *supportedPartitions++;
    348        } else {
    349            break;
    350        }
    351    }
    352    return regionDistance;
    353 }
    354 
    355 int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {
    356 #if U_DEBUG
    357    UStringTrieResult result =
    358 #endif
    359    iter.resetToState64(startState).next(u'*');  // <*, *>
    360    U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
    361    int32_t distance = iter.getValue();
    362    U_ASSERT(distance >= 0);
    363    return distance;
    364 }
    365 
    366 int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {
    367    uint8_t c;
    368    if ((c = *s) == 0) {
    369        return -1;  // no empty subtags in the distance data
    370    }
    371    for (;;) {
    372        c = uprv_invCharToAscii(c);
    373        // EBCDIC: If *s is not an invariant character,
    374        // then c is now 0 and will simply not match anything, which is harmless.
    375        uint8_t next = *++s;
    376        if (next != 0) {
    377            if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
    378                return -1;
    379            }
    380        } else {
    381            // last character of this subtag
    382            UStringTrieResult result = iter.next(c | END_OF_SUBTAG);
    383            if (wantValue) {
    384                if (USTRINGTRIE_HAS_VALUE(result)) {
    385                    int32_t value = iter.getValue();
    386                    if (result == USTRINGTRIE_FINAL_VALUE) {
    387                        value |= DISTANCE_IS_FINAL;
    388                    }
    389                    return value;
    390                }
    391            } else {
    392                if (USTRINGTRIE_HAS_NEXT(result)) {
    393                    return 0;
    394                }
    395            }
    396            return -1;
    397        }
    398        c = next;
    399    }
    400 }
    401 
    402 bool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
    403    // Linear search for a very short list (length 6 as of 2019),
    404    // because we look for equivalence not equality, and
    405    // because it's easy.
    406    // If there are many paradigm LSRs we should use a hash set
    407    // with custom comparator and hasher.
    408    U_ASSERT(paradigmLSRsLength <= 15);
    409    for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
    410        if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; }
    411    }
    412    return false;
    413 }
    414 
    415 U_NAMESPACE_END