locdistance.cpp (17141B)
1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // locdistance.cpp 5 // created: 2019may08 Markus W. Scherer 6 7 #include "unicode/utypes.h" 8 #include "unicode/bytestrie.h" 9 #include "unicode/localematcher.h" 10 #include "unicode/locid.h" 11 #include "unicode/uobject.h" 12 #include "unicode/ures.h" 13 #include "cstring.h" 14 #include "locdistance.h" 15 #include "loclikelysubtags.h" 16 #include "uassert.h" 17 #include "ucln_cmn.h" 18 #include "uinvchar.h" 19 #include "umutex.h" 20 21 U_NAMESPACE_BEGIN 22 23 namespace { 24 25 /** 26 * Bit flag used on the last character of a subtag in the trie. 27 * Must be set consistently by the builder and the lookup code. 28 */ 29 constexpr int32_t END_OF_SUBTAG = 0x80; 30 /** Distance value bit flag, set by the builder. */ 31 constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80; 32 /** Distance value bit flag, set by trieNext(). */ 33 constexpr int32_t DISTANCE_IS_FINAL = 0x100; 34 constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; 35 36 constexpr int32_t ABOVE_THRESHOLD = 100; 37 38 // Indexes into array of distances. 39 enum { 40 IX_DEF_LANG_DISTANCE, 41 IX_DEF_SCRIPT_DISTANCE, 42 IX_DEF_REGION_DISTANCE, 43 IX_MIN_REGION_DISTANCE, 44 IX_LIMIT 45 }; 46 47 LocaleDistance *gLocaleDistance = nullptr; 48 UInitOnce gInitOnce {}; 49 50 UBool U_CALLCONV cleanup() { 51 delete gLocaleDistance; 52 gLocaleDistance = nullptr; 53 gInitOnce.reset(); 54 return true; 55 } 56 57 } // namespace 58 59 void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { 60 // This function is invoked only via umtx_initOnce(). 61 U_ASSERT(gLocaleDistance == nullptr); 62 const LikelySubtags &likely = *LikelySubtags::getSingleton(errorCode); 63 if (U_FAILURE(errorCode)) { return; } 64 const LocaleDistanceData &data = likely.getDistanceData(); 65 if (data.distanceTrieBytes == nullptr || 66 data.regionToPartitions == nullptr || data.partitions == nullptr || 67 // ok if no paradigms 68 data.distances == nullptr) { 69 errorCode = U_MISSING_RESOURCE_ERROR; 70 return; 71 } 72 gLocaleDistance = new LocaleDistance(data, likely); 73 if (gLocaleDistance == nullptr) { 74 errorCode = U_MEMORY_ALLOCATION_ERROR; 75 return; 76 } 77 ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup); 78 } 79 80 const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { 81 if (U_FAILURE(errorCode)) { return nullptr; } 82 umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode); 83 return gLocaleDistance; 84 } 85 86 LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const LikelySubtags &likely) : 87 likelySubtags(likely), 88 trie(data.distanceTrieBytes), 89 regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), 90 paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), 91 defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]), 92 defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]), 93 defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]), 94 minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) { 95 // For the default demotion value, use the 96 // default region distance between unrelated Englishes. 97 // Thus, unless demotion is turned off, 98 // a mere region difference for one desired locale 99 // is as good as a perfect match for the next following desired locale. 100 // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>. 101 LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR); 102 LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR); 103 const LSR *p_enGB = &enGB; 104 int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1, 105 shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY); 106 defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance); 107 } 108 109 int32_t LocaleDistance::getBestIndexAndDistance( 110 const LSR &desired, 111 const LSR **supportedLSRs, int32_t supportedLSRsLength, 112 int32_t shiftedThreshold, 113 ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const { 114 BytesTrie iter(trie); 115 // Look up the desired language only once for all supported LSRs. 116 // Its "distance" is either a match point value of 0, or a non-match negative value. 117 // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. 118 int32_t desLangDistance = trieNext(iter, desired.language, false); 119 uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0; 120 // Index of the supported LSR with the lowest distance. 121 int32_t bestIndex = -1; 122 // Cached lookup info from LikelySubtags.compareLikely(). 123 int32_t bestLikelyInfo = -1; 124 for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { 125 const LSR &supported = *supportedLSRs[slIndex]; 126 bool star = false; 127 int32_t distance = desLangDistance; 128 if (distance >= 0) { 129 U_ASSERT((distance & DISTANCE_IS_FINAL) == 0); 130 if (slIndex != 0) { 131 iter.resetToState64(desLangState); 132 } 133 distance = trieNext(iter, supported.language, true); 134 } 135 // Note: The data builder verifies that there are no rules with "any" (*) language and 136 // real (non *) script or region subtags. 137 // This means that if the lookup for either language fails we can use 138 // the default distances without further lookups. 139 int32_t flags; 140 if (distance >= 0) { 141 flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; 142 distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; 143 } else { // <*, *> 144 if (uprv_strcmp(desired.language, supported.language) == 0) { 145 distance = 0; 146 } else { 147 distance = defaultLanguageDistance; 148 } 149 flags = 0; 150 star = true; 151 } 152 U_ASSERT(0 <= distance && distance <= 100); 153 // Round up the shifted threshold (if fraction bits are not 0) 154 // for comparison with un-shifted distances until we need fraction bits. 155 // (If we simply shifted non-zero fraction bits away, then we might ignore a language 156 // when it's really still a micro distance below the threshold.) 157 int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT; 158 // We implement "favor subtag" by reducing the language subtag distance 159 // (unscientifically reducing it to a quarter of the normal value), 160 // so that the script distance is relatively more important. 161 // For example, given a default language distance of 80, we reduce it to 20, 162 // which is below the default threshold of 50, which is the default script distance. 163 if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { 164 distance >>= 2; 165 } 166 // Let distance == roundedThreshold pass until the tie-breaker logic 167 // at the end of the loop. 168 if (distance > roundedThreshold) { 169 continue; 170 } 171 172 int32_t scriptDistance; 173 if (star || flags != 0) { 174 if (uprv_strcmp(desired.script, supported.script) == 0) { 175 scriptDistance = 0; 176 } else { 177 scriptDistance = defaultScriptDistance; 178 } 179 } else { 180 scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(), 181 desired.script, supported.script); 182 flags = scriptDistance & DISTANCE_IS_FINAL; 183 scriptDistance &= ~DISTANCE_IS_FINAL; 184 } 185 distance += scriptDistance; 186 if (distance > roundedThreshold) { 187 continue; 188 } 189 190 if (uprv_strcmp(desired.region, supported.region) == 0) { 191 // regionDistance = 0 192 } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { 193 distance += defaultRegionDistance; 194 } else { 195 int32_t remainingThreshold = roundedThreshold - distance; 196 if (minRegionDistance > remainingThreshold) { 197 continue; 198 } 199 200 // From here on we know the regions are not equal. 201 // Map each region to zero or more partitions. (zero = one non-matching string) 202 // (Each array of single-character partition strings is encoded as one string.) 203 // If either side has more than one, then we find the maximum distance. 204 // This could be optimized by adding some more structure, but probably not worth it. 205 distance += getRegionPartitionsDistance( 206 iter, iter.getState64(), 207 partitionsForRegion(desired), 208 partitionsForRegion(supported), 209 remainingThreshold); 210 } 211 int32_t shiftedDistance = shiftDistance(distance); 212 if (shiftedDistance == 0) { 213 // Distinguish between equivalent but originally unequal locales via an 214 // additional micro distance. 215 shiftedDistance |= (desired.flags ^ supported.flags); 216 if (shiftedDistance < shiftedThreshold) { 217 if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || 218 // Is there also a match when we swap desired/supported? 219 isMatch(supported, desired, shiftedThreshold, favorSubtag)) { 220 if (shiftedDistance == 0) { 221 return slIndex << INDEX_SHIFT; 222 } 223 bestIndex = slIndex; 224 shiftedThreshold = shiftedDistance; 225 bestLikelyInfo = -1; 226 } 227 } 228 } else { 229 if (shiftedDistance < shiftedThreshold) { 230 if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || 231 // Is there also a match when we swap desired/supported? 232 isMatch(supported, desired, shiftedThreshold, favorSubtag)) { 233 bestIndex = slIndex; 234 shiftedThreshold = shiftedDistance; 235 bestLikelyInfo = -1; 236 } 237 } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) { 238 if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || 239 // Is there also a match when we swap desired/supported? 240 isMatch(supported, desired, shiftedThreshold, favorSubtag)) { 241 bestLikelyInfo = likelySubtags.compareLikely( 242 supported, *supportedLSRs[bestIndex], bestLikelyInfo); 243 if ((bestLikelyInfo & 1) != 0) { 244 // This supported locale matches as well as the previous best match, 245 // and neither matches perfectly, 246 // but this one is "more likely" (has more-default subtags). 247 bestIndex = slIndex; 248 } 249 } 250 } 251 } 252 } 253 return bestIndex >= 0 ? 254 (bestIndex << INDEX_SHIFT) | shiftedThreshold : 255 INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD); 256 } 257 258 int32_t LocaleDistance::getDesSuppScriptDistance( 259 BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) { 260 // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. 261 int32_t distance = trieNext(iter, desired, false); 262 if (distance >= 0) { 263 distance = trieNext(iter, supported, true); 264 } 265 if (distance < 0) { 266 UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *> 267 U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); 268 if (uprv_strcmp(desired, supported) == 0) { 269 distance = 0; // same script 270 } else { 271 distance = iter.getValue(); 272 U_ASSERT(distance >= 0); 273 } 274 if (result == USTRINGTRIE_FINAL_VALUE) { 275 distance |= DISTANCE_IS_FINAL; 276 } 277 } 278 return distance; 279 } 280 281 int32_t LocaleDistance::getRegionPartitionsDistance( 282 BytesTrie &iter, uint64_t startState, 283 const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) { 284 char desired = *desiredPartitions++; 285 char supported = *supportedPartitions++; 286 U_ASSERT(desired != 0 && supported != 0); 287 // See if we have single desired/supported partitions, from NUL-terminated 288 // partition strings without explicit length. 289 bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character 290 // equivalent to: if (desLength == 1 && suppLength == 1) 291 if (*desiredPartitions == 0 && !suppLengthGt1) { 292 // Fastpath for single desired/supported partitions. 293 UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); 294 if (USTRINGTRIE_HAS_NEXT(result)) { 295 result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); 296 if (USTRINGTRIE_HAS_VALUE(result)) { 297 return iter.getValue(); 298 } 299 } 300 return getFallbackRegionDistance(iter, startState); 301 } 302 303 const char *supportedStart = supportedPartitions - 1; // for restart of inner loop 304 int32_t regionDistance = 0; 305 // Fall back to * only once, not for each pair of partition strings. 306 bool star = false; 307 for (;;) { 308 // Look up each desired-partition string only once, 309 // not for each (desired, supported) pair. 310 UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); 311 if (USTRINGTRIE_HAS_NEXT(result)) { 312 uint64_t desState = suppLengthGt1 ? iter.getState64() : 0; 313 for (;;) { 314 result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); 315 int32_t d; 316 if (USTRINGTRIE_HAS_VALUE(result)) { 317 d = iter.getValue(); 318 } else if (star) { 319 d = 0; 320 } else { 321 d = getFallbackRegionDistance(iter, startState); 322 star = true; 323 } 324 if (d > threshold) { 325 return d; 326 } else if (regionDistance < d) { 327 regionDistance = d; 328 } 329 if ((supported = *supportedPartitions++) != 0) { 330 iter.resetToState64(desState); 331 } else { 332 break; 333 } 334 } 335 } else if (!star) { 336 int32_t d = getFallbackRegionDistance(iter, startState); 337 if (d > threshold) { 338 return d; 339 } else if (regionDistance < d) { 340 regionDistance = d; 341 } 342 star = true; 343 } 344 if ((desired = *desiredPartitions++) != 0) { 345 iter.resetToState64(startState); 346 supportedPartitions = supportedStart; 347 supported = *supportedPartitions++; 348 } else { 349 break; 350 } 351 } 352 return regionDistance; 353 } 354 355 int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) { 356 #if U_DEBUG 357 UStringTrieResult result = 358 #endif 359 iter.resetToState64(startState).next(u'*'); // <*, *> 360 U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); 361 int32_t distance = iter.getValue(); 362 U_ASSERT(distance >= 0); 363 return distance; 364 } 365 366 int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) { 367 uint8_t c; 368 if ((c = *s) == 0) { 369 return -1; // no empty subtags in the distance data 370 } 371 for (;;) { 372 c = uprv_invCharToAscii(c); 373 // EBCDIC: If *s is not an invariant character, 374 // then c is now 0 and will simply not match anything, which is harmless. 375 uint8_t next = *++s; 376 if (next != 0) { 377 if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { 378 return -1; 379 } 380 } else { 381 // last character of this subtag 382 UStringTrieResult result = iter.next(c | END_OF_SUBTAG); 383 if (wantValue) { 384 if (USTRINGTRIE_HAS_VALUE(result)) { 385 int32_t value = iter.getValue(); 386 if (result == USTRINGTRIE_FINAL_VALUE) { 387 value |= DISTANCE_IS_FINAL; 388 } 389 return value; 390 } 391 } else { 392 if (USTRINGTRIE_HAS_NEXT(result)) { 393 return 0; 394 } 395 } 396 return -1; 397 } 398 c = next; 399 } 400 } 401 402 bool LocaleDistance::isParadigmLSR(const LSR &lsr) const { 403 // Linear search for a very short list (length 6 as of 2019), 404 // because we look for equivalence not equality, and 405 // because it's easy. 406 // If there are many paradigm LSRs we should use a hash set 407 // with custom comparator and hasher. 408 U_ASSERT(paradigmLSRsLength <= 15); 409 for (int32_t i = 0; i < paradigmLSRsLength; ++i) { 410 if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; } 411 } 412 return false; 413 } 414 415 U_NAMESPACE_END