locid.cpp (94731B)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 1997-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
*
* File locid.cpp
*
* Created by: Richard Gillam
*
* Modification History:
*
*   Date        Name        Description
*   02/11/97    aliu        Changed gLocPath to fgDataDirectory and added
*                           methods to get and set it.
*   04/02/97    aliu        Made operator!= inline; fixed return value
*                           of getName().
*   04/15/97    aliu        Cleanup for AIX/Win32.
*   04/24/97    aliu        Numerous changes per code review.
*   08/18/98    stephen     Changed getDisplayName()
*                           Added SIMPLIFIED_CHINESE, TRADITIONAL_CHINESE
*                           Added getISOCountries(), getISOLanguages(),
*                           getLanguagesForCountry()
*   03/16/99    bertrand    rehaul.
*   07/21/99    stephen     Added U_CFUNC setDefault
*   11/09/99    weiv        Added const char * getName() const;
*   04/12/00    srl         removing unicodestring api's and cached hash code
*   08/10/01    grhoten     Change the static Locales to accessor functions
******************************************************************************
*/

#include <cstddef>
#include <optional>
#include <string_view>
#include <type_traits>
#include <utility>

#include "unicode/bytestream.h"
#include "unicode/locid.h"
#include "unicode/localebuilder.h"
#include "unicode/localpointer.h"
#include "unicode/strenum.h"
#include "unicode/stringpiece.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"

#include "bytesinkutil.h"
#include "charstr.h"
#include "charstrmap.h"
#include "cmemory.h"
#include "cstring.h"
#include "fixedstring.h"
#include "mutex.h"
#include "putilimp.h"
#include "uassert.h"
#include "ucln_cmn.h"
#include "uhash.h"
#include "ulocimp.h"
#include "umutex.h"
#include "uniquecharstr.h"
#include "ustr_imp.h"
#include "uvector.h"

U_NAMESPACE_BEGIN

// Array of eMAX_LOCALES predefined Locale objects, indexed by ELocalePos.
// Allocated and filled by locale_init(), released by locale_cleanup().
static Locale   *gLocaleCache = nullptr;
// Init-once guard associated with gLocaleCache; reset by locale_cleanup().
static UInitOnce gLocaleCacheInitOnce {};

// gDefaultLocaleMutex protects all access to gDefaultLocalesHashT and gDefaultLocale.
static UMutex gDefaultLocaleMutex;
// Hash table of every Locale that has been installed as the default, keyed by
// locale name. The table owns its values: deleteLocale() is registered as the
// value deleter in locale_set_default_internal().
static UHashtable *gDefaultLocalesHashT = nullptr;
// The current default Locale. Non-owning: it points at an entry of
// gDefaultLocalesHashT.
static Locale *gDefaultLocale = nullptr;

/**
 * \def ULOC_STRING_LIMIT
 * strings beyond this value crash in CharString
 */
#define ULOC_STRING_LIMIT 357913941

U_NAMESPACE_END

// Index of each predefined locale inside gLocaleCache. eMAX_LOCALES is not a
// locale: it is the number of entries and is used as the array size in
// locale_init().
typedef enum ELocalePos {
    eENGLISH,
    eFRENCH,
    eGERMAN,
    eITALIAN,
    eJAPANESE,
    eKOREAN,
    eCHINESE,

    eFRANCE,
    eGERMANY,
    eITALY,
    eJAPAN,
    eKOREA,
    eCHINA,      /* Alias for PRC */
    eTAIWAN,
    eUK,
    eUS,
    eCANADA,
    eCANADA_FRENCH,
    eROOT,


    //eDEFAULT,
    eMAX_LOCALES
} ELocalePos;

namespace {

//
// Deleter function for Locales owned by the default Locale hash table.
// obj is the stored value; it is cast back to icu::Locale* and deleted.
//
void U_CALLCONV
deleteLocale(void *obj) {
    delete static_cast<icu::Locale*>(obj);
}

// Cleanup callback registered under UCLN_COMMON_LOCALE. Frees the predefined
// locale cache, resets its init-once guard, closes the default-locale hash
// table and clears gDefaultLocale. Always returns true.
UBool U_CALLCONV locale_cleanup()
{
    U_NAMESPACE_USE

    delete [] gLocaleCache;
    gLocaleCache = nullptr;
    gLocaleCacheInitOnce.reset();

    if (gDefaultLocalesHashT) {
        uhash_close(gDefaultLocalesHashT);   // Automatically deletes all elements, using deleter func.
gDefaultLocalesHashT = nullptr;
    }
    // gDefaultLocale pointed into the table that was just closed, so it must
    // not be left dangling.
    gDefaultLocale = nullptr;
    return true;
}

// Allocates gLocaleCache (eMAX_LOCALES entries) and fills in the predefined
// locales. On allocation failure sets status to U_MEMORY_ALLOCATION_ERROR and
// returns without registering the cleanup callback.
void U_CALLCONV locale_init(UErrorCode &status) {
    U_NAMESPACE_USE

    U_ASSERT(gLocaleCache == nullptr);
    gLocaleCache = new Locale[static_cast<int>(eMAX_LOCALES)];
    if (gLocaleCache == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    // Register the cleanup before populating so the array is released by
    // locale_cleanup() from this point on.
    ucln_common_registerCleanup(UCLN_COMMON_LOCALE, locale_cleanup);
    gLocaleCache[eROOT]          = Locale("");
    gLocaleCache[eENGLISH]       = Locale("en");
    gLocaleCache[eFRENCH]        = Locale("fr");
    gLocaleCache[eGERMAN]        = Locale("de");
    gLocaleCache[eITALIAN]       = Locale("it");
    gLocaleCache[eJAPANESE]      = Locale("ja");
    gLocaleCache[eKOREAN]        = Locale("ko");
    gLocaleCache[eCHINESE]       = Locale("zh");
    gLocaleCache[eFRANCE]        = Locale("fr", "FR");
    gLocaleCache[eGERMANY]       = Locale("de", "DE");
    gLocaleCache[eITALY]         = Locale("it", "IT");
    gLocaleCache[eJAPAN]         = Locale("ja", "JP");
    gLocaleCache[eKOREA]         = Locale("ko", "KR");
    gLocaleCache[eCHINA]         = Locale("zh", "CN");
    gLocaleCache[eTAIWAN]        = Locale("zh", "TW");
    gLocaleCache[eUK]            = Locale("en", "GB");
    gLocaleCache[eUS]            = Locale("en", "US");
    gLocaleCache[eCANADA]        = Locale("en", "CA");
    gLocaleCache[eCANADA_FRENCH] = Locale("fr", "CA");
}

} // namespace

U_NAMESPACE_BEGIN

// Installs the locale named by id as the default locale and returns it.
// A null id means "use the host's default locale ID"; that ID is always
// canonicalized, whereas a caller-supplied id only goes through
// ulocimp_getName(). On any failure status is set and the previously
// installed default (possibly nullptr) is returned unchanged.
Locale *locale_set_default_internal(const char *id, UErrorCode& status) {
    // Synchronize this entire function.
    Mutex lock(&gDefaultLocaleMutex);

    UBool canonicalize = false;

    // If given a nullptr string for the locale id, grab the default
    //   name from the system.
    //   (Different from most other locale APIs, where a null name means use
    //    the current ICU default locale.)
    if (id == nullptr) {
        id = uprv_getDefaultLocaleID();   // This function not thread safe? TODO: verify.
        canonicalize = true; // always canonicalize host ID
    }

    CharString localeNameBuf =
        canonicalize ?
ulocimp_canonicalize(id, status) : ulocimp_getName(id, status); 189 190 if (U_FAILURE(status)) { 191 return gDefaultLocale; 192 } 193 194 if (gDefaultLocalesHashT == nullptr) { 195 gDefaultLocalesHashT = uhash_open(uhash_hashChars, uhash_compareChars, nullptr, &status); 196 if (U_FAILURE(status)) { 197 return gDefaultLocale; 198 } 199 uhash_setValueDeleter(gDefaultLocalesHashT, deleteLocale); 200 ucln_common_registerCleanup(UCLN_COMMON_LOCALE, locale_cleanup); 201 } 202 203 Locale* newDefault = static_cast<Locale*>(uhash_get(gDefaultLocalesHashT, localeNameBuf.data())); 204 if (newDefault == nullptr) { 205 newDefault = new Locale(Locale::eBOGUS); 206 if (newDefault == nullptr) { 207 status = U_MEMORY_ALLOCATION_ERROR; 208 return gDefaultLocale; 209 } 210 newDefault->init(localeNameBuf.data(), false); 211 uhash_put(gDefaultLocalesHashT, const_cast<char*>(newDefault->getName()), newDefault, &status); 212 if (U_FAILURE(status)) { 213 return gDefaultLocale; 214 } 215 } 216 gDefaultLocale = newDefault; 217 return gDefaultLocale; 218 } 219 220 U_NAMESPACE_END 221 222 /* sfb 07/21/99 */ 223 U_CFUNC void 224 locale_set_default(const char *id) 225 { 226 U_NAMESPACE_USE 227 UErrorCode status = U_ZERO_ERROR; 228 locale_set_default_internal(id, status); 229 } 230 /* end */ 231 232 U_CFUNC const char * 233 locale_get_default() 234 { 235 U_NAMESPACE_USE 236 return Locale::getDefault().getName(); 237 } 238 239 namespace { 240 241 template <auto FIELD, typename T> 242 void copyToArray(std::string_view sv, T* that) { 243 auto& field = that->*FIELD; 244 constexpr size_t capacity = std::extent_v<std::remove_reference_t<decltype(field)>>; 245 static_assert(capacity > 0); 246 if (!sv.empty()) { 247 U_ASSERT(sv.size() < capacity); 248 uprv_memcpy(field, sv.data(), sv.size()); 249 } 250 field[sv.size()] = '\0'; 251 } 252 253 } // namespace 254 255 U_NAMESPACE_BEGIN 256 257 void Locale::Nest::init(std::string_view language, 258 std::string_view script, 259 std::string_view region, 260 
uint8_t variantBegin) { 261 copyToArray<&Nest::language>(language, this); 262 copyToArray<&Nest::script>(script, this); 263 copyToArray<&Nest::region>(region, this); 264 this->variantBegin = variantBegin; 265 } 266 267 Locale::Nest::Nest(Heap&& heap, uint8_t variantBegin) { 268 // When moving from Heap to Nest the language field can be left untouched 269 // (as it has the same offset in both) and only the script and region fields 270 // need to be copied to their new locations, which is safe to do because the 271 // new locations come before the old locations in memory and don't overlap. 272 static_assert(offsetof(Nest, region) <= offsetof(Heap, script)); 273 static_assert(offsetof(Nest, variantBegin) <= offsetof(Heap, region)); 274 U_ASSERT(this == reinterpret_cast<Nest*>(&heap)); 275 copyToArray<&Nest::script>(heap.script, this); 276 copyToArray<&Nest::region>(heap.region, this); 277 this->variantBegin = variantBegin; 278 *this->baseName = '\0'; 279 } 280 281 struct Locale::Heap::Alloc : public UMemory { 282 FixedString fullName; 283 FixedString baseName; 284 int32_t variantBegin; 285 286 const char* getVariant() const { return variantBegin == 0 ? 
"" : getBaseName() + variantBegin; } 287 const char* getFullName() const { return fullName.data(); } 288 const char* getBaseName() const { 289 if (baseName.isEmpty()) { 290 if (const char* name = fullName.data(); *name != '@') { 291 return name; 292 } 293 } 294 return baseName.data(); 295 } 296 297 Alloc(int32_t variantBegin) : fullName(), baseName(), variantBegin(variantBegin) {} 298 299 Alloc(const Alloc& other, UErrorCode& status) 300 : fullName(), baseName(), variantBegin(other.variantBegin) { 301 if (U_SUCCESS(status)) { 302 if (!other.fullName.isEmpty()) { 303 fullName = other.fullName; 304 if (fullName.isEmpty()) { 305 status = U_MEMORY_ALLOCATION_ERROR; 306 } else { 307 if (!other.baseName.isEmpty()) { 308 baseName = other.baseName; 309 if (baseName.isEmpty()) { 310 status = U_MEMORY_ALLOCATION_ERROR; 311 } 312 } 313 } 314 } 315 } 316 } 317 318 // Move should be done on the owner of the pointer to this object. 319 Alloc(Alloc&&) noexcept = delete; 320 321 ~Alloc() = default; 322 }; 323 324 const char* Locale::Heap::getVariant() const { return ptr->getVariant(); } 325 const char* Locale::Heap::getFullName() const { return ptr->getFullName(); } 326 const char* Locale::Heap::getBaseName() const { return ptr->getBaseName(); } 327 328 Locale::Heap::Heap(std::string_view language, 329 std::string_view script, 330 std::string_view region, 331 int32_t variantBegin) { 332 ptr = new Alloc(variantBegin); 333 if (ptr == nullptr) { 334 type = eBOGUS; 335 } else { 336 type = eHEAP; 337 copyToArray<&Heap::language>(language, this); 338 copyToArray<&Heap::script>(script, this); 339 copyToArray<&Heap::region>(region, this); 340 } 341 } 342 343 Locale::Heap::~Heap() { 344 U_ASSERT(type == eHEAP); 345 delete ptr; 346 } 347 348 Locale::Heap& Locale::Heap::operator=(const Heap& other) { 349 U_ASSERT(type == eBOGUS); 350 UErrorCode status = U_ZERO_ERROR; 351 ptr = new Alloc(*other.ptr, status); 352 if (ptr == nullptr || U_FAILURE(status)) { 353 delete ptr; 354 } else { 355 type 
= eHEAP;
        // Only after the allocation succeeded are the inline subtag arrays
        // copied; on failure the target keeps its eBOGUS tag.
        uprv_memcpy(language, other.language, sizeof language);
        uprv_memcpy(script, other.script, sizeof script);
        uprv_memcpy(region, other.region, sizeof region);
    }
    return *this;
}

// Move assignment into a target that is asserted to be eBOGUS. Ownership of
// the allocation is transferred by copying the pointer; other is then tagged
// eBOGUS, so Payload::~Payload() (which destroys the Heap only when the tag is
// eHEAP) will not release the same allocation a second time.
Locale::Heap& Locale::Heap::operator=(Heap&& other) noexcept {
    U_ASSERT(type == eBOGUS);
    ptr = other.ptr;
    type = eHEAP;
    other.type = eBOGUS;
    uprv_memcpy(language, other.language, sizeof language);
    uprv_memcpy(script, other.script, sizeof script);
    uprv_memcpy(region, other.region, sizeof region);
    return *this;
}

// Dispatches on the payload's type tag: calls bogusFn(args...),
// nestFn(nest, args...) or heapFn(heap, args...) and returns the result.
// Any other tag value is treated as unreachable.
template <typename BogusFn, typename NestFn, typename HeapFn, typename... Args>
auto Locale::Payload::visit(BogusFn bogusFn, NestFn nestFn, HeapFn heapFn, Args... args) const {
    switch (type) {
        case eBOGUS:
            return bogusFn(args...);
        case eNEST:
            return nestFn(nest, args...);
        case eHEAP:
            return heapFn(heap, args...);
        default:
            UPRV_UNREACHABLE_EXIT;
    };
}

// Copies the active member of other into *this. A bogus source copies
// nothing. Callers in this file only invoke it on a payload whose tag is
// eBOGUS (see the copy constructor and copy assignment below).
void Locale::Payload::copy(const Payload& other) {
    other.visit([](Payload*) {},
                [](const Nest& nest, Payload* dst) { dst->nest = nest; },
                [](const Heap& heap, Payload* dst) { dst->heap = heap; },
                this);
}

// Like copy(), but a Heap source is moved from. visit() is a const member and
// therefore hands out a const Heap&; the const_cast restores the mutability
// that the Payload&& parameter already grants.
void Locale::Payload::move(Payload&& other) noexcept {
    other.visit(
        [](Payload*) {},
        [](const Nest& nest, Payload* dst) { dst->nest = nest; },
        [](const Heap& heap, Payload* dst) { dst->heap = std::move(const_cast<Heap&>(heap)); },
        this);
}

// Only the Heap member is explicitly destroyed; the other states are left
// as they are.
Locale::Payload::~Payload() {
    if (type == eHEAP) { heap.~Heap(); }
}

// Both constructors start out as eBOGUS and then copy/move the source in.
Locale::Payload::Payload(const Payload& other) : type{eBOGUS} { copy(other); }
Locale::Payload::Payload(Payload&& other) noexcept : type{eBOGUS} { move(std::move(other)); }

// Copy assignment: release the current contents, then copy. Self-assignment
// is a no-op.
Locale::Payload& Locale::Payload::operator=(const Payload& other) {
    if (this != &other) {
        setToBogus();
        copy(other);
    }
    return *this;
}

// Move assignment: release the current contents, then move. Self-assignment
// is a no-op.
Locale::Payload& Locale::Payload::operator=(Payload&& other) noexcept {
if (this != &other) {
        setToBogus();
        move(std::move(other));
    }
    return *this;
}

// Destroys whatever the payload currently holds and tags it eBOGUS.
void Locale::Payload::setToBogus() {
    this->~Payload();
    type = eBOGUS;
}

// Constructs a Nest or a Heap in place from args and returns a reference to
// it. The Nest branch first runs the Payload destructor on the current
// contents; the Heap branch instead asserts that no Heap is currently held
// and constructs without destroying anything. For any other T neither branch
// is instantiated and the function has no return statement.
template <typename T, typename... Args> T& Locale::Payload::emplace(Args&&... args) {
    if constexpr (std::is_same_v<T, Nest>) {
        this->~Payload();
        ::new (&nest) Nest(std::forward<Args>(args)...);
        return nest;
    }
    if constexpr (std::is_same_v<T, Heap>) {
        U_ASSERT(type != eHEAP);
        ::new (&heap) Heap(std::forward<Args>(args)...);
        return heap;
    }
}

// Typed accessors: return the member if it is the active one, else nullptr.
template <> Locale::Nest* Locale::Payload::get() { return type == eNEST ? &nest : nullptr; }
template <> Locale::Heap* Locale::Payload::get() { return type == eHEAP ? &heap : nullptr; }

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Locale)

/*Character separating the posix id fields*/
// '_'
// In the platform codepage.
#define SEP_CHAR '_'
#define NULL_CHAR '\0'

Locale::~Locale() = default;

// Default constructor: initializes from a null locale ID (see the comment on
// the internal constructor below: this tries to get the default locale).
Locale::Locale()
    : UObject(), payload()
{
    init(nullptr, false);
}

/*
 * Internal constructor to allow construction of a locale object with
 *   NO side effects.   (Default constructor tries to get
 *   the default locale.)
 */
Locale::Locale(Locale::ELocaleType)
    : UObject(), payload()
{
}


// Builds a locale from separate language, country, variant and keyword
// strings by concatenating them into one locale ID and parsing that ID with
// init(). Any of the arguments may be nullptr.
Locale::Locale( const   char * newLanguage,
                const   char * newCountry,
                const   char * newVariant,
                const   char * newKeywords)
    : UObject(), payload()
{
    // Note that newKeywords does not take part in this test: keywords alone,
    // without language, country or variant, take the shortcut as well.
    if( (newLanguage==nullptr) && (newCountry == nullptr) && (newVariant == nullptr) )
    {
        init(nullptr, false); /* shortcut */
    }
    else
    {
        UErrorCode status = U_ZERO_ERROR;
        int32_t lsize = 0;
        int32_t csize = 0;
        int32_t vsize = 0;
        int32_t ksize = 0;

        // Check the sizes of the input strings.
// Each early return below leaves the object exactly as the member
        // initializers left it (payload() only; init() is never called).
        // NOTE(review): presumably that state is the bogus locale - confirm
        // against the Payload default constructor.

        // Language
        if ( newLanguage != nullptr )
        {
            lsize = static_cast<int32_t>(uprv_strlen(newLanguage));
            if ( lsize < 0 || lsize > ULOC_STRING_LIMIT ) { // int32 wrap
                return;
            }
        }

        CharString togo(newLanguage, lsize, status); // start with newLanguage

        // _Country
        if ( newCountry != nullptr )
        {
            csize = static_cast<int32_t>(uprv_strlen(newCountry));
            if ( csize < 0 || csize > ULOC_STRING_LIMIT ) { // int32 wrap
                return;
            }
        }

        // _Variant
        if ( newVariant != nullptr )
        {
            // remove leading _'s
            while(newVariant[0] == SEP_CHAR)
            {
                newVariant++;
            }

            // remove trailing _'s
            vsize = static_cast<int32_t>(uprv_strlen(newVariant));
            if ( vsize < 0 || vsize > ULOC_STRING_LIMIT ) { // int32 wrap
                return;
            }
            // Trailing separators are excluded by shrinking vsize only; the
            // string itself is not modified.
            while( (vsize>1) && (newVariant[vsize-1] == SEP_CHAR) )
            {
                vsize--;
            }
        }

        if ( newKeywords != nullptr)
        {
            ksize = static_cast<int32_t>(uprv_strlen(newKeywords));
            if ( ksize < 0 || ksize > ULOC_STRING_LIMIT ) {
              return;
            }
        }

        // We've checked the input sizes, now build up the full locale string..

        // newLanguage is already copied

        // A separator follows the language whenever a country or a variant is
        // present; with a variant but no country this yields "lang__variant".
        if ( ( vsize != 0 ) || (csize != 0) )  // at least:  __v
        {                                      //            ^
            togo.append(SEP_CHAR, status);
        }

        if ( csize != 0 )
        {
            togo.append(newCountry, status);
        }

        if ( vsize != 0)
        {
            togo.append(SEP_CHAR, status)
                .append(newVariant, vsize, status);
        }

        if ( ksize != 0)
        {
            // Text containing '=' is appended as a keyword list after '@';
            // anything else is appended after '_' (two of them when there is
            // no variant).
            if (uprv_strchr(newKeywords, '=')) {
                togo.append('@', status); /* keyword parsing */
            }
            else {
                togo.append('_', status); /* Variant parsing with a script */
                if ( vsize == 0) {
                    togo.append('_', status); /* No country found */
                }
            }
            togo.append(newKeywords, status);
        }

        if (U_FAILURE(status)) {
            // Something went wrong with appending, etc.
578 return; 579 } 580 // Parse it, because for example 'language' might really be a complete 581 // string. 582 init(togo.data(), false); 583 } 584 } 585 586 Locale::Locale(const Locale&) = default; 587 Locale::Locale(Locale&&) noexcept = default; 588 589 Locale& Locale::operator=(const Locale&) = default; 590 Locale& Locale::operator=(Locale&&) noexcept = default; 591 592 Locale * 593 Locale::clone() const { 594 return new Locale(*this); 595 } 596 597 bool 598 Locale::operator==( const Locale& other) const 599 { 600 return uprv_strcmp(other.getName(), getName()) == 0; 601 } 602 603 namespace { 604 605 UInitOnce gKnownCanonicalizedInitOnce {}; 606 UHashtable *gKnownCanonicalized = nullptr; 607 608 constexpr const char* KNOWN_CANONICALIZED[] = { 609 "c", 610 // Commonly used locales known are already canonicalized 611 "af", "af_ZA", "am", "am_ET", "ar", "ar_001", "as", "as_IN", "az", "az_AZ", 612 "be", "be_BY", "bg", "bg_BG", "bn", "bn_IN", "bs", "bs_BA", "ca", "ca_ES", 613 "cs", "cs_CZ", "cy", "cy_GB", "da", "da_DK", "de", "de_DE", "el", "el_GR", 614 "en", "en_GB", "en_US", "es", "es_419", "es_ES", "et", "et_EE", "eu", 615 "eu_ES", "fa", "fa_IR", "fi", "fi_FI", "fil", "fil_PH", "fr", "fr_FR", 616 "ga", "ga_IE", "gl", "gl_ES", "gu", "gu_IN", "he", "he_IL", "hi", "hi_IN", 617 "hr", "hr_HR", "hu", "hu_HU", "hy", "hy_AM", "id", "id_ID", "is", "is_IS", 618 "it", "it_IT", "ja", "ja_JP", "jv", "jv_ID", "ka", "ka_GE", "kk", "kk_KZ", 619 "km", "km_KH", "kn", "kn_IN", "ko", "ko_KR", "ky", "ky_KG", "lo", "lo_LA", 620 "lt", "lt_LT", "lv", "lv_LV", "mk", "mk_MK", "ml", "ml_IN", "mn", "mn_MN", 621 "mr", "mr_IN", "ms", "ms_MY", "my", "my_MM", "nb", "nb_NO", "ne", "ne_NP", 622 "nl", "nl_NL", "no", "or", "or_IN", "pa", "pa_IN", "pl", "pl_PL", "ps", "ps_AF", 623 "pt", "pt_BR", "pt_PT", "ro", "ro_RO", "ru", "ru_RU", "sd", "sd_IN", "si", 624 "si_LK", "sk", "sk_SK", "sl", "sl_SI", "so", "so_SO", "sq", "sq_AL", "sr", 625 "sr_Cyrl_RS", "sr_Latn", "sr_RS", "sv", "sv_SE", "sw", "sw_TZ", 
"ta", 626 "ta_IN", "te", "te_IN", "th", "th_TH", "tk", "tk_TM", "tr", "tr_TR", "uk", 627 "uk_UA", "ur", "ur_PK", "uz", "uz_UZ", "vi", "vi_VN", "yue", "yue_Hant", 628 "yue_Hant_HK", "yue_HK", "zh", "zh_CN", "zh_Hans", "zh_Hans_CN", "zh_Hant", 629 "zh_Hant_TW", "zh_TW", "zu", "zu_ZA" 630 }; 631 632 UBool U_CALLCONV cleanupKnownCanonicalized() { 633 gKnownCanonicalizedInitOnce.reset(); 634 if (gKnownCanonicalized) { uhash_close(gKnownCanonicalized); } 635 return true; 636 } 637 638 void U_CALLCONV loadKnownCanonicalized(UErrorCode &status) { 639 ucln_common_registerCleanup(UCLN_COMMON_LOCALE_KNOWN_CANONICALIZED, 640 cleanupKnownCanonicalized); 641 LocalUHashtablePointer newKnownCanonicalizedMap( 642 uhash_open(uhash_hashChars, uhash_compareChars, nullptr, &status)); 643 for (int32_t i = 0; 644 U_SUCCESS(status) && i < UPRV_LENGTHOF(KNOWN_CANONICALIZED); 645 i++) { 646 uhash_puti(newKnownCanonicalizedMap.getAlias(), 647 (void*)KNOWN_CANONICALIZED[i], 648 1, &status); 649 } 650 if (U_FAILURE(status)) { 651 return; 652 } 653 654 gKnownCanonicalized = newKnownCanonicalizedMap.orphan(); 655 } 656 657 class AliasData; 658 659 /** 660 * A Builder class to build the alias data. 661 */ 662 class AliasDataBuilder { 663 public: 664 AliasDataBuilder() { 665 } 666 667 // Build the AliasData from resource. 668 AliasData* build(UErrorCode &status); 669 670 private: 671 void readAlias(UResourceBundle* alias, 672 UniqueCharStrings* strings, 673 LocalMemory<const char*>& types, 674 LocalMemory<int32_t>& replacementIndexes, 675 int32_t &length, 676 void (*checkType)(const char* type), 677 void (*checkReplacement)(const UChar* replacement), 678 UErrorCode &status); 679 680 // Read the languageAlias data from alias to 681 // strings+types+replacementIndexes 682 // The number of record will be stored into length. 683 // Allocate length items for types, to store the type field. 
// Allocate length items for replacementIndexes,
    // to store the index in the strings for the replacement language.
    void readLanguageAlias(UResourceBundle* alias,
                           UniqueCharStrings* strings,
                           LocalMemory<const char*>& types,
                           LocalMemory<int32_t>& replacementIndexes,
                           int32_t &length,
                           UErrorCode &status);

    // Read the scriptAlias data from alias to
    // strings+types+replacementIndexes
    // The number of records will be stored into length.
    // Allocate length items for types, to store the type field.
    // Allocate length items for replacementIndexes,
    // to store the index in the strings for the replacement script.
    void readScriptAlias(UResourceBundle* alias,
                         UniqueCharStrings* strings,
                         LocalMemory<const char*>& types,
                         LocalMemory<int32_t>& replacementIndexes,
                         int32_t &length, UErrorCode &status);

    // Read the territoryAlias data from alias to
    // strings+types+replacementIndexes
    // The number of records will be stored into length.
    // Allocate length items for types, to store the type field.
    // Allocate length items for replacementIndexes,
    // to store the index in the strings for the replacement region.
    void readTerritoryAlias(UResourceBundle* alias,
                            UniqueCharStrings* strings,
                            LocalMemory<const char*>& types,
                            LocalMemory<int32_t>& replacementIndexes,
                            int32_t &length, UErrorCode &status);

    // Read the variantAlias data from alias to
    // strings+types+replacementIndexes
    // The number of records will be stored into length.
    // Allocate length items for types, to store the type field.
    // Allocate length items for replacementIndexes,
    // to store the index in the strings for the replacement variant.
    void readVariantAlias(UResourceBundle* alias,
                          UniqueCharStrings* strings,
                          LocalMemory<const char*>& types,
                          LocalMemory<int32_t>& replacementIndexes,
                          int32_t &length, UErrorCode &status);

    // Read the subdivisionAlias data from alias to
    // strings+types+replacementIndexes
    // Allocate length items for types, to store the type field.
// Allocate length items for replacementIndexes,
    // to store the index in the strings for the replacement subdivision.
    void readSubdivisionAlias(UResourceBundle* alias,
                              UniqueCharStrings* strings,
                              LocalMemory<const char*>& types,
                              LocalMemory<int32_t>& replacementIndexes,
                              int32_t &length, UErrorCode &status);
};

/**
 * A class to hold the Alias Data.
 *
 * Holds one alias map per subtag kind (language, script, territory, variant,
 * subdivision) plus the CharString that the object deletes in its destructor.
 * Instances are created by AliasDataBuilder (a friend); callers obtain the
 * shared instance through singleton().
 */
class AliasData : public UMemory {
public:
    // Returns the shared instance, loading it on first use via loadData().
    // Returns nullptr without attempting to load when status already holds
    // an error on entry.
    static const AliasData* singleton(UErrorCode& status) {
        if (U_FAILURE(status)) {
            // Do not get into loadData if the status already has error.
            return nullptr;
        }
        umtx_initOnce(AliasData::gInitOnce, &AliasData::loadData, status);
        return gSingleton;
    }

    // Read-only access to the individual alias maps.
    const CharStringMap& languageMap() const { return language; }
    const CharStringMap& scriptMap() const { return script; }
    const CharStringMap& territoryMap() const { return territory; }
    const CharStringMap& variantMap() const { return variant; }
    const CharStringMap& subdivisionMap() const { return subdivision; }

    // Init-once function used by singleton(), and the matching cleanup
    // callback that loadData() registers.
    static void U_CALLCONV loadData(UErrorCode &status);
    static UBool U_CALLCONV cleanup();

    static UInitOnce gInitOnce;

private:
    // Takes the five maps by value and moves them into place; takes
    // ownership of strings (deleted in the destructor).
    AliasData(CharStringMap languageMap,
              CharStringMap scriptMap,
              CharStringMap territoryMap,
              CharStringMap variantMap,
              CharStringMap subdivisionMap,
              CharString* strings)
        : language(std::move(languageMap)),
          script(std::move(scriptMap)),
          territory(std::move(territoryMap)),
          variant(std::move(variantMap)),
          subdivision(std::move(subdivisionMap)),
          strings(strings) {
    }

    ~AliasData() {
        delete strings;
    }

    // The shared instance; assigned by loadData().
    static const AliasData* gSingleton;

    CharStringMap language;
    CharStringMap script;
    CharStringMap territory;
    CharStringMap variant;
    CharStringMap subdivision;
    // Owned; NOTE(review): presumably the backing storage for the
    // replacement strings referenced by the maps - confirm in build().
    CharString* strings;

    friend class AliasDataBuilder;
};


const AliasData* AliasData::gSingleton = 
nullptr; 796 UInitOnce AliasData::gInitOnce {}; 797 798 UBool U_CALLCONV 799 AliasData::cleanup() 800 { 801 gInitOnce.reset(); 802 delete gSingleton; 803 return true; 804 } 805 806 void 807 AliasDataBuilder::readAlias( 808 UResourceBundle* alias, 809 UniqueCharStrings* strings, 810 LocalMemory<const char*>& types, 811 LocalMemory<int32_t>& replacementIndexes, 812 int32_t &length, 813 void (*checkType)(const char* type), 814 void (*checkReplacement)(const UChar* replacement), 815 UErrorCode &status) { 816 if (U_FAILURE(status)) { 817 return; 818 } 819 length = ures_getSize(alias); 820 const char** rawTypes = types.allocateInsteadAndCopy(length); 821 if (rawTypes == nullptr) { 822 status = U_MEMORY_ALLOCATION_ERROR; 823 return; 824 } 825 int32_t* rawIndexes = replacementIndexes.allocateInsteadAndCopy(length); 826 if (rawIndexes == nullptr) { 827 status = U_MEMORY_ALLOCATION_ERROR; 828 return; 829 } 830 for (int i = 0; U_SUCCESS(status) && ures_hasNext(alias); i++) { 831 LocalUResourceBundlePointer res( 832 ures_getNextResource(alias, nullptr, &status)); 833 const char* aliasFrom = ures_getKey(res.getAlias()); 834 const UChar* aliasTo = 835 ures_getStringByKey(res.getAlias(), "replacement", nullptr, &status); 836 if (U_FAILURE(status)) return; 837 838 checkType(aliasFrom); 839 checkReplacement(aliasTo); 840 841 rawTypes[i] = aliasFrom; 842 rawIndexes[i] = strings->add(aliasTo, status); 843 } 844 } 845 846 /** 847 * Read the languageAlias data from alias to strings+types+replacementIndexes. 848 * Allocate length items for types, to store the type field. Allocate length 849 * items for replacementIndexes, to store the index in the strings for the 850 * replacement language. 
851 */ 852 void 853 AliasDataBuilder::readLanguageAlias( 854 UResourceBundle* alias, 855 UniqueCharStrings* strings, 856 LocalMemory<const char*>& types, 857 LocalMemory<int32_t>& replacementIndexes, 858 int32_t &length, 859 UErrorCode &status) 860 { 861 return readAlias( 862 alias, strings, types, replacementIndexes, length, 863 #if U_DEBUG 864 [](const char* type) { 865 // Assert the aliasFrom only contains the following possibilities 866 // language_REGION_variant 867 // language_REGION 868 // language_variant 869 // language 870 // und_variant 871 Locale test(type); 872 // Assert no script in aliasFrom 873 U_ASSERT(test.getScript()[0] == '\0'); 874 // Assert when language is und, no REGION in aliasFrom. 875 U_ASSERT(test.getLanguage()[0] != '\0' || test.getCountry()[0] == '\0'); 876 }, 877 #else 878 [](const char*) {}, 879 #endif 880 [](const UChar*) {}, status); 881 } 882 883 /** 884 * Read the scriptAlias data from alias to strings+types+replacementIndexes. 885 * Allocate length items for types, to store the type field. Allocate length 886 * items for replacementIndexes, to store the index in the strings for the 887 * replacement script. 888 */ 889 void 890 AliasDataBuilder::readScriptAlias( 891 UResourceBundle* alias, 892 UniqueCharStrings* strings, 893 LocalMemory<const char*>& types, 894 LocalMemory<int32_t>& replacementIndexes, 895 int32_t &length, 896 UErrorCode &status) 897 { 898 return readAlias( 899 alias, strings, types, replacementIndexes, length, 900 #if U_DEBUG 901 [](const char* type) { 902 U_ASSERT(uprv_strlen(type) == 4); 903 }, 904 [](const UChar* replacement) { 905 U_ASSERT(u_strlen(replacement) == 4); 906 }, 907 #else 908 [](const char*) {}, 909 [](const UChar*) { }, 910 #endif 911 status); 912 } 913 914 /** 915 * Read the territoryAlias data from alias to strings+types+replacementIndexes. 916 * Allocate length items for types, to store the type field. 
Allocate length 917 * items for replacementIndexes, to store the index in the strings for the 918 * replacement regions. 919 */ 920 void 921 AliasDataBuilder::readTerritoryAlias( 922 UResourceBundle* alias, 923 UniqueCharStrings* strings, 924 LocalMemory<const char*>& types, 925 LocalMemory<int32_t>& replacementIndexes, 926 int32_t &length, 927 UErrorCode &status) 928 { 929 return readAlias( 930 alias, strings, types, replacementIndexes, length, 931 #if U_DEBUG 932 [](const char* type) { 933 U_ASSERT(uprv_strlen(type) == 2 || uprv_strlen(type) == 3); 934 }, 935 #else 936 [](const char*) {}, 937 #endif 938 [](const UChar*) { }, 939 status); 940 } 941 942 /** 943 * Read the variantAlias data from alias to strings+types+replacementIndexes. 944 * Allocate length items for types, to store the type field. Allocate length 945 * items for replacementIndexes, to store the index in the strings for the 946 * replacement variant. 947 */ 948 void 949 AliasDataBuilder::readVariantAlias( 950 UResourceBundle* alias, 951 UniqueCharStrings* strings, 952 LocalMemory<const char*>& types, 953 LocalMemory<int32_t>& replacementIndexes, 954 int32_t &length, 955 UErrorCode &status) 956 { 957 return readAlias( 958 alias, strings, types, replacementIndexes, length, 959 #if U_DEBUG 960 [](const char* type) { 961 U_ASSERT(uprv_strlen(type) >= 4 && uprv_strlen(type) <= 8); 962 U_ASSERT(uprv_strlen(type) != 4 || 963 (type[0] >= '0' && type[0] <= '9')); 964 }, 965 [](const UChar* replacement) { 966 int32_t len = u_strlen(replacement); 967 U_ASSERT(len >= 4 && len <= 8); 968 U_ASSERT(len != 4 || 969 (*replacement >= u'0' && 970 *replacement <= u'9')); 971 }, 972 #else 973 [](const char*) {}, 974 [](const UChar*) { }, 975 #endif 976 status); 977 } 978 979 /** 980 * Read the subdivisionAlias data from alias to strings+types+replacementIndexes. 981 * Allocate length items for types, to store the type field. 
Allocate length 982 * items for replacementIndexes, to store the index in the strings for the 983 * replacement regions. 984 */ 985 void 986 AliasDataBuilder::readSubdivisionAlias( 987 UResourceBundle* alias, 988 UniqueCharStrings* strings, 989 LocalMemory<const char*>& types, 990 LocalMemory<int32_t>& replacementIndexes, 991 int32_t &length, 992 UErrorCode &status) 993 { 994 return readAlias( 995 alias, strings, types, replacementIndexes, length, 996 #if U_DEBUG 997 [](const char* type) { 998 U_ASSERT(uprv_strlen(type) >= 3 && uprv_strlen(type) <= 8); 999 }, 1000 #else 1001 [](const char*) {}, 1002 #endif 1003 [](const UChar*) { }, 1004 status); 1005 } 1006 1007 /** 1008 * Initializes the alias data from the ICU resource bundles. The alias data 1009 * contains alias of language, country, script and variants. 1010 * 1011 * If the alias data has already loaded, then this method simply returns without 1012 * doing anything meaningful. 1013 */ 1014 void U_CALLCONV 1015 AliasData::loadData(UErrorCode &status) 1016 { 1017 #ifdef LOCALE_CANONICALIZATION_DEBUG 1018 UDate start = uprv_getRawUTCtime(); 1019 #endif // LOCALE_CANONICALIZATION_DEBUG 1020 ucln_common_registerCleanup(UCLN_COMMON_LOCALE_ALIAS, cleanup); 1021 AliasDataBuilder builder; 1022 gSingleton = builder.build(status); 1023 #ifdef LOCALE_CANONICALIZATION_DEBUG 1024 UDate end = uprv_getRawUTCtime(); 1025 printf("AliasData::loadData took total %f ms\n", end - start); 1026 #endif // LOCALE_CANONICALIZATION_DEBUG 1027 } 1028 1029 /** 1030 * Build the alias data from resources. 
1031 */ 1032 AliasData* 1033 AliasDataBuilder::build(UErrorCode &status) { 1034 if (U_FAILURE(status)) { return nullptr; } 1035 1036 LocalUResourceBundlePointer metadata( 1037 ures_openDirect(nullptr, "metadata", &status)); 1038 LocalUResourceBundlePointer metadataAlias( 1039 ures_getByKey(metadata.getAlias(), "alias", nullptr, &status)); 1040 LocalUResourceBundlePointer languageAlias( 1041 ures_getByKey(metadataAlias.getAlias(), "language", nullptr, &status)); 1042 LocalUResourceBundlePointer scriptAlias( 1043 ures_getByKey(metadataAlias.getAlias(), "script", nullptr, &status)); 1044 LocalUResourceBundlePointer territoryAlias( 1045 ures_getByKey(metadataAlias.getAlias(), "territory", nullptr, &status)); 1046 LocalUResourceBundlePointer variantAlias( 1047 ures_getByKey(metadataAlias.getAlias(), "variant", nullptr, &status)); 1048 LocalUResourceBundlePointer subdivisionAlias( 1049 ures_getByKey(metadataAlias.getAlias(), "subdivision", nullptr, &status)); 1050 1051 if (U_FAILURE(status)) { 1052 return nullptr; 1053 } 1054 int32_t languagesLength = 0, scriptLength = 0, territoryLength = 0, 1055 variantLength = 0, subdivisionLength = 0; 1056 1057 // Read the languageAlias into languageTypes, languageReplacementIndexes 1058 // and strings 1059 UniqueCharStrings strings(status); 1060 LocalMemory<const char*> languageTypes; 1061 LocalMemory<int32_t> languageReplacementIndexes; 1062 readLanguageAlias(languageAlias.getAlias(), 1063 &strings, 1064 languageTypes, 1065 languageReplacementIndexes, 1066 languagesLength, 1067 status); 1068 1069 // Read the scriptAlias into scriptTypes, scriptReplacementIndexes 1070 // and strings 1071 LocalMemory<const char*> scriptTypes; 1072 LocalMemory<int32_t> scriptReplacementIndexes; 1073 readScriptAlias(scriptAlias.getAlias(), 1074 &strings, 1075 scriptTypes, 1076 scriptReplacementIndexes, 1077 scriptLength, 1078 status); 1079 1080 // Read the territoryAlias into territoryTypes, territoryReplacementIndexes 1081 // and strings 1082 
LocalMemory<const char*> territoryTypes; 1083 LocalMemory<int32_t> territoryReplacementIndexes; 1084 readTerritoryAlias(territoryAlias.getAlias(), 1085 &strings, 1086 territoryTypes, 1087 territoryReplacementIndexes, 1088 territoryLength, status); 1089 1090 // Read the variantAlias into variantTypes, variantReplacementIndexes 1091 // and strings 1092 LocalMemory<const char*> variantTypes; 1093 LocalMemory<int32_t> variantReplacementIndexes; 1094 readVariantAlias(variantAlias.getAlias(), 1095 &strings, 1096 variantTypes, 1097 variantReplacementIndexes, 1098 variantLength, status); 1099 1100 // Read the subdivisionAlias into subdivisionTypes, subdivisionReplacementIndexes 1101 // and strings 1102 LocalMemory<const char*> subdivisionTypes; 1103 LocalMemory<int32_t> subdivisionReplacementIndexes; 1104 readSubdivisionAlias(subdivisionAlias.getAlias(), 1105 &strings, 1106 subdivisionTypes, 1107 subdivisionReplacementIndexes, 1108 subdivisionLength, status); 1109 1110 if (U_FAILURE(status)) { 1111 return nullptr; 1112 } 1113 1114 // We can only use strings after freeze it. 
1115 strings.freeze(); 1116 1117 // Build the languageMap from languageTypes & languageReplacementIndexes 1118 CharStringMap languageMap(490, status); 1119 for (int32_t i = 0; U_SUCCESS(status) && i < languagesLength; i++) { 1120 languageMap.put(languageTypes[i], 1121 strings.get(languageReplacementIndexes[i]), 1122 status); 1123 } 1124 1125 // Build the scriptMap from scriptTypes & scriptReplacementIndexes 1126 CharStringMap scriptMap(1, status); 1127 for (int32_t i = 0; U_SUCCESS(status) && i < scriptLength; i++) { 1128 scriptMap.put(scriptTypes[i], 1129 strings.get(scriptReplacementIndexes[i]), 1130 status); 1131 } 1132 1133 // Build the territoryMap from territoryTypes & territoryReplacementIndexes 1134 CharStringMap territoryMap(650, status); 1135 for (int32_t i = 0; U_SUCCESS(status) && i < territoryLength; i++) { 1136 territoryMap.put(territoryTypes[i], 1137 strings.get(territoryReplacementIndexes[i]), 1138 status); 1139 } 1140 1141 // Build the variantMap from variantTypes & variantReplacementIndexes. 1142 CharStringMap variantMap(2, status); 1143 for (int32_t i = 0; U_SUCCESS(status) && i < variantLength; i++) { 1144 variantMap.put(variantTypes[i], 1145 strings.get(variantReplacementIndexes[i]), 1146 status); 1147 } 1148 1149 // Build the subdivisionMap from subdivisionTypes & subdivisionReplacementIndexes. 
1150 CharStringMap subdivisionMap(2, status); 1151 for (int32_t i = 0; U_SUCCESS(status) && i < subdivisionLength; i++) { 1152 subdivisionMap.put(subdivisionTypes[i], 1153 strings.get(subdivisionReplacementIndexes[i]), 1154 status); 1155 } 1156 1157 if (U_FAILURE(status)) { 1158 return nullptr; 1159 } 1160 1161 // copy hashtables 1162 auto *data = new AliasData( 1163 std::move(languageMap), 1164 std::move(scriptMap), 1165 std::move(territoryMap), 1166 std::move(variantMap), 1167 std::move(subdivisionMap), 1168 strings.orphanCharStrings()); 1169 1170 if (data == nullptr) { 1171 status = U_MEMORY_ALLOCATION_ERROR; 1172 } 1173 return data; 1174 } 1175 1176 /** 1177 * A class that find the replacement values of locale fields by using AliasData. 1178 */ 1179 class AliasReplacer { 1180 public: 1181 AliasReplacer(UErrorCode& status) : 1182 language(nullptr), script(nullptr), region(nullptr), 1183 extensions(nullptr), 1184 // store value in variants only once 1185 variants(nullptr, 1186 ([](UElement e1, UElement e2) -> UBool { 1187 return 0==uprv_strcmp((const char*)e1.pointer, 1188 (const char*)e2.pointer);}), 1189 status), 1190 data(nullptr) { 1191 } 1192 ~AliasReplacer() { 1193 } 1194 1195 // Check the fields inside locale, if need to replace fields, 1196 // place the replaced locale ID in out and return true. 1197 // Otherwise return false for no replacement or error. 1198 bool replace( 1199 const Locale& locale, CharString& out, UErrorCode& status); 1200 1201 private: 1202 const char* language; 1203 const char* script; 1204 const char* region; 1205 const char* extensions; 1206 UVector variants; 1207 1208 const AliasData* data; 1209 1210 inline bool notEmpty(const char* str) { 1211 return str && str[0] != NULL_CHAR; 1212 } 1213 1214 /** 1215 * If replacement is neither null nor empty and input is either null or empty, 1216 * return replacement. 1217 * If replacement is neither null nor empty but input is not empty, return input. 
1218 * If replacement is either null or empty and type is either null or empty, 1219 * return input. 1220 * Otherwise return null. 1221 * replacement input type return 1222 * AAA nullptr * AAA 1223 * AAA BBB * BBB 1224 * nullptr || "" CCC nullptr CCC 1225 * nullptr || "" * DDD nullptr 1226 */ 1227 inline const char* deleteOrReplace( 1228 const char* input, const char* type, const char* replacement) { 1229 return notEmpty(replacement) ? 1230 ((input == nullptr) ? replacement : input) : 1231 ((type == nullptr) ? input : nullptr); 1232 } 1233 1234 inline bool same(const char* a, const char* b) { 1235 if (a == nullptr && b == nullptr) { 1236 return true; 1237 } 1238 if ((a == nullptr && b != nullptr) || 1239 (a != nullptr && b == nullptr)) { 1240 return false; 1241 } 1242 return uprv_strcmp(a, b) == 0; 1243 } 1244 1245 // Gather fields and generate locale ID into out. 1246 CharString& outputToString(CharString& out, UErrorCode& status); 1247 1248 // Generate the lookup key. 1249 CharString& generateKey(const char* language, const char* region, 1250 const char* variant, CharString& out, 1251 UErrorCode& status); 1252 1253 void parseLanguageReplacement(const char* replacement, 1254 const char*& replaceLanguage, 1255 const char*& replaceScript, 1256 const char*& replaceRegion, 1257 const char*& replaceVariant, 1258 const char*& replaceExtensions, 1259 UVector& toBeFreed, 1260 UErrorCode& status); 1261 1262 // Replace by using languageAlias. 1263 bool replaceLanguage(bool checkLanguage, bool checkRegion, 1264 bool checkVariants, UVector& toBeFreed, 1265 UErrorCode& status); 1266 1267 // Replace by using territoryAlias. 1268 bool replaceTerritory(UVector& toBeFreed, UErrorCode& status); 1269 1270 // Replace by using scriptAlias. 1271 bool replaceScript(UErrorCode& status); 1272 1273 // Replace by using variantAlias. 1274 bool replaceVariant(UErrorCode& status); 1275 1276 // Replace by using subdivisionAlias. 
1277 bool replaceSubdivision(StringPiece subdivision, 1278 CharString& output, UErrorCode& status); 1279 1280 // Replace transformed extensions. 1281 bool replaceTransformedExtensions( 1282 CharString& transformedExtensions, CharString& output, UErrorCode& status); 1283 }; 1284 1285 CharString& 1286 AliasReplacer::generateKey( 1287 const char* language, const char* region, const char* variant, 1288 CharString& out, UErrorCode& status) 1289 { 1290 if (U_FAILURE(status)) { return out; } 1291 out.append(language, status); 1292 if (notEmpty(region)) { 1293 out.append(SEP_CHAR, status) 1294 .append(region, status); 1295 } 1296 if (notEmpty(variant)) { 1297 out.append(SEP_CHAR, status) 1298 .append(variant, status); 1299 } 1300 return out; 1301 } 1302 1303 void 1304 AliasReplacer::parseLanguageReplacement( 1305 const char* replacement, 1306 const char*& replacedLanguage, 1307 const char*& replacedScript, 1308 const char*& replacedRegion, 1309 const char*& replacedVariant, 1310 const char*& replacedExtensions, 1311 UVector& toBeFreed, 1312 UErrorCode& status) 1313 { 1314 if (U_FAILURE(status)) { 1315 return; 1316 } 1317 replacedScript = replacedRegion = replacedVariant 1318 = replacedExtensions = nullptr; 1319 if (uprv_strchr(replacement, '_') == nullptr) { 1320 replacedLanguage = replacement; 1321 // reach the end, just return it. 1322 return; 1323 } 1324 // We have multiple field so we have to allocate and parse 1325 CharString* str = 1326 new CharString(replacement, static_cast<int32_t>(uprv_strlen(replacement)), status); 1327 LocalPointer<CharString> lpStr(str, status); 1328 toBeFreed.adoptElement(lpStr.orphan(), status); 1329 if (U_FAILURE(status)) { 1330 return; 1331 } 1332 char* data = str->data(); 1333 replacedLanguage = (const char*) data; 1334 char* endOfField = uprv_strchr(data, '_'); 1335 *endOfField = '\0'; // null terminiate it. 
1336 endOfField++; 1337 const char* start = endOfField; 1338 endOfField = const_cast<char*>(uprv_strchr(start, '_')); 1339 size_t len = 0; 1340 if (endOfField == nullptr) { 1341 len = uprv_strlen(start); 1342 } else { 1343 len = endOfField - start; 1344 *endOfField = '\0'; // null terminiate it. 1345 } 1346 if (len == 4 && uprv_isASCIILetter(*start)) { 1347 // Got a script 1348 replacedScript = start; 1349 if (endOfField == nullptr) { 1350 return; 1351 } 1352 start = endOfField++; 1353 endOfField = const_cast<char*>(uprv_strchr(start, '_')); 1354 if (endOfField == nullptr) { 1355 len = uprv_strlen(start); 1356 } else { 1357 len = endOfField - start; 1358 *endOfField = '\0'; // null terminiate it. 1359 } 1360 } 1361 if (len >= 2 && len <= 3) { 1362 // Got a region 1363 replacedRegion = start; 1364 if (endOfField == nullptr) { 1365 return; 1366 } 1367 start = endOfField++; 1368 endOfField = const_cast<char*>(uprv_strchr(start, '_')); 1369 if (endOfField == nullptr) { 1370 len = uprv_strlen(start); 1371 } else { 1372 len = endOfField - start; 1373 *endOfField = '\0'; // null terminiate it. 1374 } 1375 } 1376 if (len >= 4) { 1377 // Got a variant 1378 replacedVariant = start; 1379 if (endOfField == nullptr) { 1380 return; 1381 } 1382 start = endOfField++; 1383 } 1384 replacedExtensions = start; 1385 } 1386 1387 bool 1388 AliasReplacer::replaceLanguage( 1389 bool checkLanguage, bool checkRegion, 1390 bool checkVariants, UVector& toBeFreed, UErrorCode& status) 1391 { 1392 if (U_FAILURE(status)) { 1393 return false; 1394 } 1395 if ( (checkRegion && region == nullptr) || 1396 (checkVariants && variants.size() == 0)) { 1397 // Nothing to search. 1398 return false; 1399 } 1400 int32_t variant_size = checkVariants ? variants.size() : 1; 1401 // Since we may have more than one variant, we need to loop through them. 1402 const char* searchLanguage = checkLanguage ? language : "und"; 1403 const char* searchRegion = checkRegion ? 
region : nullptr; 1404 const char* searchVariant = nullptr; 1405 for (int32_t variant_index = 0; 1406 variant_index < variant_size; 1407 variant_index++) { 1408 if (checkVariants) { 1409 U_ASSERT(variant_index < variant_size); 1410 searchVariant = static_cast<const char*>(variants.elementAt(variant_index)); 1411 } 1412 1413 if (searchVariant != nullptr && uprv_strlen(searchVariant) < 4) { 1414 // Do not consider ill-formed variant subtag. 1415 searchVariant = nullptr; 1416 } 1417 CharString typeKey; 1418 generateKey(searchLanguage, searchRegion, searchVariant, typeKey, 1419 status); 1420 if (U_FAILURE(status)) { 1421 return false; 1422 } 1423 const char *replacement = data->languageMap().get(typeKey.data()); 1424 if (replacement == nullptr) { 1425 // Found no replacement data. 1426 continue; 1427 } 1428 1429 const char* replacedLanguage = nullptr; 1430 const char* replacedScript = nullptr; 1431 const char* replacedRegion = nullptr; 1432 const char* replacedVariant = nullptr; 1433 const char* replacedExtensions = nullptr; 1434 parseLanguageReplacement(replacement, 1435 replacedLanguage, 1436 replacedScript, 1437 replacedRegion, 1438 replacedVariant, 1439 replacedExtensions, 1440 toBeFreed, 1441 status); 1442 replacedLanguage = 1443 (replacedLanguage != nullptr && uprv_strcmp(replacedLanguage, "und") == 0) ? 1444 language : replacedLanguage; 1445 replacedScript = deleteOrReplace(script, nullptr, replacedScript); 1446 replacedRegion = deleteOrReplace(region, searchRegion, replacedRegion); 1447 replacedVariant = deleteOrReplace( 1448 searchVariant, searchVariant, replacedVariant); 1449 1450 if ( same(language, replacedLanguage) && 1451 same(script, replacedScript) && 1452 same(region, replacedRegion) && 1453 same(searchVariant, replacedVariant) && 1454 replacedExtensions == nullptr) { 1455 // Replacement produce no changes. 
1456 continue; 1457 } 1458 1459 language = replacedLanguage; 1460 region = replacedRegion; 1461 script = replacedScript; 1462 if (searchVariant != nullptr) { 1463 if (notEmpty(replacedVariant)) { 1464 variants.setElementAt((void*)replacedVariant, variant_index); 1465 } else { 1466 variants.removeElementAt(variant_index); 1467 } 1468 } 1469 if (replacedExtensions != nullptr) { 1470 // DO NOTHING 1471 // UTS35 does not specify what should we do if we have extensions in the 1472 // replacement. Currently we know only the following 4 "BCP47 LegacyRules" have 1473 // extensions in them languageAlias: 1474 // i_default => en_x_i_default 1475 // i_enochian => und_x_i_enochian 1476 // i_mingo => see_x_i_mingo 1477 // zh_min => nan_x_zh_min 1478 // But all of them are already changed by code inside ultag_parse() before 1479 // hitting this code. 1480 } 1481 1482 // Something changed by language alias data. 1483 return true; 1484 } 1485 // Nothing changed by language alias data. 1486 return false; 1487 } 1488 1489 bool 1490 AliasReplacer::replaceTerritory(UVector& toBeFreed, UErrorCode& status) 1491 { 1492 if (U_FAILURE(status)) { 1493 return false; 1494 } 1495 if (region == nullptr) { 1496 // No region to search. 1497 return false; 1498 } 1499 const char *replacement = data->territoryMap().get(region); 1500 if (replacement == nullptr) { 1501 // Found no replacement data for this region. 1502 return false; 1503 } 1504 const char* replacedRegion = replacement; 1505 const char* firstSpace = uprv_strchr(replacement, ' '); 1506 if (firstSpace != nullptr) { 1507 // If there are are more than one region in the replacement. 1508 // We need to check which one match based on the language. 1509 // Cannot use nullptr for language because that will construct 1510 // the default locale, in that case, use "und" to get the correct 1511 // locale. 1512 Locale l = LocaleBuilder() 1513 .setLanguage(language == nullptr ? 
"und" : language) 1514 .setScript(script) 1515 .build(status); 1516 l.addLikelySubtags(status); 1517 const char* likelyRegion = l.getCountry(); 1518 LocalPointer<CharString> item; 1519 if (likelyRegion != nullptr && uprv_strlen(likelyRegion) > 0) { 1520 size_t len = uprv_strlen(likelyRegion); 1521 const char* foundInReplacement = uprv_strstr(replacement, 1522 likelyRegion); 1523 if (foundInReplacement != nullptr) { 1524 // Assuming the case there are no three letter region code in 1525 // the replacement of territoryAlias 1526 U_ASSERT(foundInReplacement == replacement || 1527 *(foundInReplacement-1) == ' '); 1528 U_ASSERT(foundInReplacement[len] == ' ' || 1529 foundInReplacement[len] == '\0'); 1530 item.adoptInsteadAndCheckErrorCode( 1531 new CharString(foundInReplacement, static_cast<int32_t>(len), status), status); 1532 } 1533 } 1534 if (item.isNull() && U_SUCCESS(status)) { 1535 item.adoptInsteadAndCheckErrorCode( 1536 new CharString(replacement, 1537 static_cast<int32_t>(firstSpace - replacement), status), status); 1538 } 1539 if (U_FAILURE(status)) { return false; } 1540 replacedRegion = item->data(); 1541 toBeFreed.adoptElement(item.orphan(), status); 1542 if (U_FAILURE(status)) { return false; } 1543 } 1544 U_ASSERT(!same(region, replacedRegion)); 1545 region = replacedRegion; 1546 // The region is changed by data in territory alias. 1547 return true; 1548 } 1549 1550 bool 1551 AliasReplacer::replaceScript(UErrorCode& status) 1552 { 1553 if (U_FAILURE(status)) { 1554 return false; 1555 } 1556 if (script == nullptr) { 1557 // No script to search. 1558 return false; 1559 } 1560 const char *replacement = data->scriptMap().get(script); 1561 if (replacement == nullptr) { 1562 // Found no replacement data for this script. 1563 return false; 1564 } 1565 U_ASSERT(!same(script, replacement)); 1566 script = replacement; 1567 // The script is changed by data in script alias. 
1568 return true; 1569 } 1570 1571 bool 1572 AliasReplacer::replaceVariant(UErrorCode& status) 1573 { 1574 if (U_FAILURE(status)) { 1575 return false; 1576 } 1577 // Since we may have more than one variant, we need to loop through them. 1578 for (int32_t i = 0; i < variants.size(); i++) { 1579 const char* variant = static_cast<const char*>(variants.elementAt(i)); 1580 const char *replacement = data->variantMap().get(variant); 1581 if (replacement == nullptr) { 1582 // Found no replacement data for this variant. 1583 continue; 1584 } 1585 U_ASSERT((uprv_strlen(replacement) >= 5 && 1586 uprv_strlen(replacement) <= 8) || 1587 (uprv_strlen(replacement) == 4 && 1588 replacement[0] >= '0' && 1589 replacement[0] <= '9')); 1590 if (!same(variant, replacement)) { 1591 variants.setElementAt((void*)replacement, i); 1592 // Special hack to handle hepburn-heploc => alalc97 1593 if (uprv_strcmp(variant, "heploc") == 0) { 1594 for (int32_t j = 0; j < variants.size(); j++) { 1595 if (uprv_strcmp((const char*)(variants.elementAt(j)), 1596 "hepburn") == 0) { 1597 variants.removeElementAt(j); 1598 } 1599 } 1600 } 1601 return true; 1602 } 1603 } 1604 return false; 1605 } 1606 1607 bool 1608 AliasReplacer::replaceSubdivision( 1609 StringPiece subdivision, CharString& output, UErrorCode& status) 1610 { 1611 if (U_FAILURE(status)) { 1612 return false; 1613 } 1614 const char *replacement = data->subdivisionMap().get(subdivision.data()); 1615 if (replacement != nullptr) { 1616 const char* firstSpace = uprv_strchr(replacement, ' '); 1617 // Found replacement data for this subdivision. 1618 size_t len = (firstSpace != nullptr) ? 1619 (firstSpace - replacement) : uprv_strlen(replacement); 1620 if (2 <= len && len <= 8) { 1621 output.append(replacement, static_cast<int32_t>(len), status); 1622 if (2 == len) { 1623 // Add 'zzzz' based on changes to UTS #35 for CLDR-14312. 
1624 output.append("zzzz", 4, status); 1625 } 1626 } 1627 return true; 1628 } 1629 return false; 1630 } 1631 1632 bool 1633 AliasReplacer::replaceTransformedExtensions( 1634 CharString& transformedExtensions, CharString& output, UErrorCode& status) 1635 { 1636 // The content of the transformedExtensions will be modified in this 1637 // function to NUL-terminating (tkey-tvalue) pairs. 1638 if (U_FAILURE(status)) { 1639 return false; 1640 } 1641 int32_t len = transformedExtensions.length(); 1642 const char* str = transformedExtensions.data(); 1643 const char* tkey = ultag_getTKeyStart(str); 1644 int32_t tlangLen = (tkey == str) ? 0 : 1645 ((tkey == nullptr) ? len : static_cast<int32_t>((tkey - str - 1))); 1646 if (tlangLen > 0) { 1647 Locale tlang = LocaleBuilder() 1648 .setLanguageTag(StringPiece(str, tlangLen)) 1649 .build(status); 1650 tlang.canonicalize(status); 1651 output = tlang.toLanguageTag<CharString>(status); 1652 if (U_FAILURE(status)) { 1653 return false; 1654 } 1655 T_CString_toLowerCase(output.data()); 1656 } 1657 if (tkey != nullptr) { 1658 // We need to sort the tfields by tkey 1659 UVector tfields(status); 1660 if (U_FAILURE(status)) { 1661 return false; 1662 } 1663 do { 1664 const char* tvalue = uprv_strchr(tkey, '-'); 1665 if (tvalue == nullptr) { 1666 status = U_ILLEGAL_ARGUMENT_ERROR; 1667 return false; 1668 } 1669 const char* nextTKey = ultag_getTKeyStart(tvalue); 1670 if (nextTKey != nullptr) { 1671 *const_cast<char*>(nextTKey - 1) = '\0'; // NUL terminate tvalue 1672 } 1673 tfields.insertElementAt((void*)tkey, tfields.size(), status); 1674 if (U_FAILURE(status)) { 1675 return false; 1676 } 1677 tkey = nextTKey; 1678 } while (tkey != nullptr); 1679 tfields.sort([](UElement e1, UElement e2) -> int32_t { 1680 return uprv_strcmp((const char*)e1.pointer, (const char*)e2.pointer); 1681 }, status); 1682 for (int32_t i = 0; i < tfields.size(); i++) { 1683 if (output.length() > 0) { 1684 output.append('-', status); 1685 } 1686 const char* tfield = 
static_cast<const char*>(tfields.elementAt(i)); 1687 const char* tvalue = uprv_strchr(tfield, '-'); 1688 if (tvalue == nullptr) { 1689 status = U_ILLEGAL_ARGUMENT_ERROR; 1690 return false; 1691 } 1692 // Split the "tkey-tvalue" pair string so that we can canonicalize the tvalue. 1693 *const_cast<char*>(tvalue++) = '\0'; // NUL terminate tkey 1694 output.append(tfield, status).append('-', status); 1695 std::optional<std::string_view> bcpTValue = ulocimp_toBcpType(tfield, tvalue); 1696 output.append(bcpTValue.has_value() ? *bcpTValue : tvalue, status); 1697 } 1698 } 1699 if (U_FAILURE(status)) { 1700 return false; 1701 } 1702 return true; 1703 } 1704 1705 CharString& 1706 AliasReplacer::outputToString( 1707 CharString& out, UErrorCode& status) 1708 { 1709 if (U_FAILURE(status)) { return out; } 1710 out.append(language, status); 1711 if (notEmpty(script)) { 1712 out.append(SEP_CHAR, status) 1713 .append(script, status); 1714 } 1715 if (notEmpty(region)) { 1716 out.append(SEP_CHAR, status) 1717 .append(region, status); 1718 } 1719 if (variants.size() > 0) { 1720 if (!notEmpty(script) && !notEmpty(region)) { 1721 out.append(SEP_CHAR, status); 1722 } 1723 variants.sort([](UElement e1, UElement e2) -> int32_t { 1724 return uprv_strcmp((const char*)e1.pointer, (const char*)e2.pointer); 1725 }, status); 1726 int32_t variantsStart = out.length(); 1727 for (int32_t i = 0; i < variants.size(); i++) { 1728 out.append(SEP_CHAR, status) 1729 .append(static_cast<const char*>(variants.elementAt(i)), 1730 status); 1731 } 1732 T_CString_toUpperCase(out.data() + variantsStart); 1733 } 1734 if (notEmpty(extensions)) { 1735 CharString tmp("und_", status); 1736 tmp.append(extensions, status); 1737 Locale tmpLocale(tmp.data()); 1738 // only support x extension inside CLDR for now. 
1739 U_ASSERT(extensions[0] == 'x'); 1740 out.append(tmpLocale.getName() + 1, status); 1741 } 1742 return out; 1743 } 1744 1745 bool 1746 AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status) 1747 { 1748 data = AliasData::singleton(status); 1749 if (U_FAILURE(status)) { 1750 return false; 1751 } 1752 U_ASSERT(data != nullptr); 1753 out.clear(); 1754 language = locale.getLanguage(); 1755 if (!notEmpty(language)) { 1756 language = nullptr; 1757 } 1758 script = locale.getScript(); 1759 if (!notEmpty(script)) { 1760 script = nullptr; 1761 } 1762 region = locale.getCountry(); 1763 if (!notEmpty(region)) { 1764 region = nullptr; 1765 } 1766 const char* variantsStr = locale.getVariant(); 1767 CharString variantsBuff(variantsStr, -1, status); 1768 if (!variantsBuff.isEmpty()) { 1769 if (U_FAILURE(status)) { return false; } 1770 char* start = variantsBuff.data(); 1771 T_CString_toLowerCase(start); 1772 char* end; 1773 while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr && 1774 U_SUCCESS(status)) { 1775 *end = NULL_CHAR; // null terminate inside variantsBuff 1776 // do not add "" or duplicate data to variants 1777 if (*start && !variants.contains(start)) { 1778 variants.addElement(start, status); 1779 } 1780 start = end + 1; 1781 } 1782 // do not add "" or duplicate data to variants 1783 if (*start && !variants.contains(start)) { 1784 variants.addElement(start, status); 1785 } 1786 } 1787 if (U_FAILURE(status)) { return false; } 1788 1789 // Sort the variants 1790 variants.sort([](UElement e1, UElement e2) -> int32_t { 1791 return uprv_strcmp((const char*)e1.pointer, (const char*)e2.pointer); 1792 }, status); 1793 1794 // A changed count to assert when loop too many times. 1795 int changed = 0; 1796 // A UVector to to hold CharString allocated by the replace* method 1797 // and freed when out of scope from his function. 
1798 UVector stringsToBeFreed([](void *obj) { delete static_cast<CharString*>(obj); }, 1799 nullptr, 10, status); 1800 while (U_SUCCESS(status)) { 1801 // Something wrong with the data cause looping here more than 10 times 1802 // already. 1803 U_ASSERT(changed < 5); 1804 // From observation of key in data/misc/metadata.txt 1805 // we know currently we only need to search in the following combination 1806 // of fields for type in languageAlias: 1807 // * lang_region_variant 1808 // * lang_region 1809 // * lang_variant 1810 // * lang 1811 // * und_variant 1812 // This assumption is ensured by the U_ASSERT in readLanguageAlias 1813 // 1814 // lang REGION variant 1815 if ( replaceLanguage(true, true, true, stringsToBeFreed, status) || 1816 replaceLanguage(true, true, false, stringsToBeFreed, status) || 1817 replaceLanguage(true, false, true, stringsToBeFreed, status) || 1818 replaceLanguage(true, false, false, stringsToBeFreed, status) || 1819 replaceLanguage(false,false, true, stringsToBeFreed, status) || 1820 replaceTerritory(stringsToBeFreed, status) || 1821 replaceScript(status) || 1822 replaceVariant(status)) { 1823 // Some values in data is changed, try to match from the beginning 1824 // again. 1825 changed++; 1826 continue; 1827 } 1828 // Nothing changed. Break out. 1829 break; 1830 } // while(1) 1831 1832 if (U_FAILURE(status)) { return false; } 1833 // Nothing changed and we know the order of the variants are not change 1834 // because we have no variant or only one. 
1835 const char* extensionsStr = locale_getKeywordsStart(locale.getName()); 1836 if (changed == 0 && variants.size() <= 1 && extensionsStr == nullptr) { 1837 return false; 1838 } 1839 outputToString(out, status); 1840 if (U_FAILURE(status)) { 1841 return false; 1842 } 1843 if (extensionsStr != nullptr) { 1844 changed = 0; 1845 Locale temp(locale); 1846 LocalPointer<icu::StringEnumeration> iter(locale.createKeywords(status)); 1847 if (U_SUCCESS(status) && !iter.isNull()) { 1848 const char* key; 1849 while ((key = iter->next(nullptr, status)) != nullptr) { 1850 if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0 || 1851 uprv_strcmp("t", key) == 0) { 1852 auto value = locale.getKeywordValue<CharString>(key, status); 1853 if (U_FAILURE(status)) { 1854 status = U_ZERO_ERROR; 1855 continue; 1856 } 1857 CharString replacement; 1858 if (uprv_strlen(key) == 2) { 1859 if (replaceSubdivision(value.toStringPiece(), replacement, status)) { 1860 changed++; 1861 temp.setKeywordValue(key, replacement.data(), status); 1862 } 1863 } else { 1864 U_ASSERT(uprv_strcmp(key, "t") == 0); 1865 if (replaceTransformedExtensions(value, replacement, status)) { 1866 changed++; 1867 temp.setKeywordValue(key, replacement.data(), status); 1868 } 1869 } 1870 if (U_FAILURE(status)) { 1871 return false; 1872 } 1873 } 1874 } 1875 } 1876 if (changed != 0) { 1877 extensionsStr = locale_getKeywordsStart(temp.getName()); 1878 } 1879 out.append(extensionsStr, status); 1880 } 1881 if (U_FAILURE(status)) { 1882 return false; 1883 } 1884 // If the tag is not changed, return. 1885 if (uprv_strcmp(out.data(), locale.getName()) == 0) { 1886 out.clear(); 1887 return false; 1888 } 1889 return true; 1890 } 1891 1892 // Return true if the locale is changed during canonicalization. 1893 // The replaced value then will be put into out. 
1894 bool 1895 canonicalizeLocale(const Locale& locale, CharString& out, UErrorCode& status) 1896 { 1897 if (U_FAILURE(status)) { return false; } 1898 AliasReplacer replacer(status); 1899 return replacer.replace(locale, out, status); 1900 } 1901 1902 // Function to optimize for known cases without so we can skip the loading 1903 // of resources in the startup time until we really need it. 1904 bool 1905 isKnownCanonicalizedLocale(const char* locale, UErrorCode& status) 1906 { 1907 if (U_FAILURE(status)) { return false; } 1908 1909 if ( uprv_strcmp(locale, "c") == 0 || 1910 uprv_strcmp(locale, "en") == 0 || 1911 uprv_strcmp(locale, "en_US") == 0) { 1912 return true; 1913 } 1914 1915 // common well-known Canonicalized. 1916 umtx_initOnce(gKnownCanonicalizedInitOnce, 1917 &loadKnownCanonicalized, status); 1918 if (U_FAILURE(status)) { 1919 return false; 1920 } 1921 U_ASSERT(gKnownCanonicalized != nullptr); 1922 return uhash_geti(gKnownCanonicalized, locale) != 0; 1923 } 1924 1925 } // namespace 1926 1927 U_NAMESPACE_END 1928 1929 // Function for testing. 1930 U_EXPORT const char* const* 1931 ulocimp_getKnownCanonicalizedLocaleForTest(int32_t& length) 1932 { 1933 U_NAMESPACE_USE 1934 length = UPRV_LENGTHOF(KNOWN_CANONICALIZED); 1935 return KNOWN_CANONICALIZED; 1936 } 1937 1938 // Function for testing. 1939 U_EXPORT bool 1940 ulocimp_isCanonicalizedLocaleForTest(const char* localeName) 1941 { 1942 U_NAMESPACE_USE 1943 Locale l(localeName); 1944 UErrorCode status = U_ZERO_ERROR; 1945 CharString temp; 1946 return !canonicalizeLocale(l, temp, status) && U_SUCCESS(status); 1947 } 1948 1949 U_NAMESPACE_BEGIN 1950 1951 Locale& Locale::init(const char* localeID, UBool canonicalize) 1952 { 1953 return localeID == nullptr ? 
*this = getDefault() : init(StringPiece{localeID}, canonicalize); 1954 } 1955 1956 /*This function initializes a Locale from a C locale ID*/ 1957 Locale& Locale::init(StringPiece localeID, UBool canonicalize) 1958 { 1959 /* Free our current storage */ 1960 Nest& nest = payload.emplace<Nest>(); 1961 1962 // not a loop: 1963 // just an easy way to have a common error-exit 1964 // without goto and without another function 1965 do { 1966 char *separator; 1967 char *field[5] = {nullptr}; 1968 int32_t fieldLen[5] = {0}; 1969 int32_t fieldIdx; 1970 int32_t variantField; 1971 int32_t length; 1972 UErrorCode err; 1973 1974 const auto parse = [canonicalize](std::string_view localeID, 1975 char* name, 1976 int32_t nameCapacity, 1977 UErrorCode& status) { 1978 return ByteSinkUtil::viaByteSinkToTerminatedChars( 1979 name, nameCapacity, 1980 [&](ByteSink& sink, UErrorCode& status) { 1981 if (canonicalize) { 1982 ulocimp_canonicalize(localeID, sink, status); 1983 } else { 1984 ulocimp_getName(localeID, sink, status); 1985 } 1986 }, 1987 status); 1988 }; 1989 1990 // "canonicalize" the locale ID to ICU/Java format 1991 char* fullName = nest.baseName; 1992 err = U_ZERO_ERROR; 1993 length = parse(localeID, fullName, sizeof Nest::baseName, err); 1994 1995 FixedString fullNameBuffer; 1996 if (err == U_BUFFER_OVERFLOW_ERROR || length >= static_cast<int32_t>(sizeof Nest::baseName)) { 1997 /*Go to heap for the fullName if necessary*/ 1998 if (!fullNameBuffer.reserve(length + 1)) { 1999 break; // error: out of memory 2000 } 2001 fullName = fullNameBuffer.getAlias(); 2002 err = U_ZERO_ERROR; 2003 length = parse(localeID, fullName, length + 1, err); 2004 } 2005 if(U_FAILURE(err) || err == U_STRING_NOT_TERMINATED_WARNING) { 2006 /* should never occur */ 2007 break; 2008 } 2009 2010 std::string_view language; 2011 std::string_view script; 2012 std::string_view region; 2013 int32_t variantBegin = length; 2014 2015 /* after uloc_getName/canonicalize() we know that only '_' are separators */ 
2016 /* But _ could also appeared in timezone such as "en@timezone=America/Los_Angeles" */ 2017 separator = field[0] = fullName; 2018 fieldIdx = 1; 2019 char* at = uprv_strchr(fullName, '@'); 2020 while ((separator = uprv_strchr(field[fieldIdx-1], SEP_CHAR)) != nullptr && 2021 fieldIdx < UPRV_LENGTHOF(field)-1 && 2022 (at == nullptr || separator < at)) { 2023 field[fieldIdx] = separator + 1; 2024 fieldLen[fieldIdx - 1] = static_cast<int32_t>(separator - field[fieldIdx - 1]); 2025 fieldIdx++; 2026 } 2027 // variant may contain @foo or .foo POSIX cruft; remove it 2028 separator = uprv_strchr(field[fieldIdx-1], '@'); 2029 char* sep2 = uprv_strchr(field[fieldIdx-1], '.'); 2030 if (separator!=nullptr || sep2!=nullptr) { 2031 if (separator==nullptr || (sep2!=nullptr && separator > sep2)) { 2032 separator = sep2; 2033 } 2034 fieldLen[fieldIdx - 1] = static_cast<int32_t>(separator - field[fieldIdx - 1]); 2035 } else { 2036 fieldLen[fieldIdx - 1] = length - static_cast<int32_t>(field[fieldIdx - 1] - fullName); 2037 } 2038 bool hasKeywords = at != nullptr && uprv_strchr(at + 1, '=') != nullptr; 2039 2040 if (fieldLen[0] >= ULOC_LANG_CAPACITY) 2041 { 2042 break; // error: the language field is too long 2043 } 2044 2045 variantField = 1; /* Usually the 2nd one, except when a script or country is also used. 
*/ 2046 if (fieldLen[0] > 0) { 2047 /* We have a language */ 2048 language = {fullName, static_cast<std::string_view::size_type>(fieldLen[0])}; 2049 } 2050 if (fieldLen[1] == 4 && uprv_isASCIILetter(field[1][0]) && 2051 uprv_isASCIILetter(field[1][1]) && uprv_isASCIILetter(field[1][2]) && 2052 uprv_isASCIILetter(field[1][3])) { 2053 /* We have at least a script */ 2054 script = {field[1], static_cast<std::string_view::size_type>(fieldLen[1])}; 2055 variantField++; 2056 } 2057 2058 if (fieldLen[variantField] == 2 || fieldLen[variantField] == 3) { 2059 /* We have a country */ 2060 region = {field[variantField], static_cast<std::string_view::size_type>(fieldLen[variantField])}; 2061 variantField++; 2062 } else if (fieldLen[variantField] == 0) { 2063 variantField++; /* script or country empty but variant in next field (i.e. en__POSIX) */ 2064 } 2065 2066 if (fieldLen[variantField] > 0) { 2067 /* We have a variant */ 2068 variantBegin = static_cast<int32_t>(field[variantField] - fullName); 2069 } else if (hasKeywords) { 2070 // The original computation of variantBegin leaves it equal to the length 2071 // of fullName if there is no variant. It should instead be 2072 // the length of the baseName. 
2073 variantBegin = static_cast<int32_t>(at - fullName); 2074 } 2075 2076 if (!hasKeywords && Nest::fits(length, language, script, region)) { 2077 U_ASSERT(fullName == nest.baseName); 2078 U_ASSERT(fullNameBuffer.isEmpty()); 2079 nest.init(language, script, region, variantBegin); 2080 } else { 2081 if (fullName == nest.baseName) { 2082 U_ASSERT(fullNameBuffer.isEmpty()); 2083 fullNameBuffer = {fullName, static_cast<std::string_view::size_type>(length)}; 2084 if (fullNameBuffer.isEmpty()) { 2085 break; // error: out of memory 2086 } 2087 if (!language.empty()) { 2088 language = {fullNameBuffer.data(), language.size()}; 2089 } 2090 if (!script.empty()) { 2091 script = {fullNameBuffer.data() + (script.data() - fullName), script.size()}; 2092 } 2093 if (!region.empty()) { 2094 region = {fullNameBuffer.data() + (region.data() - fullName), region.size()}; 2095 } 2096 } 2097 Heap& heap = payload.emplace<Heap>(language, script, region, variantBegin); 2098 if (isBogus()) { 2099 break; // error: out of memory 2100 } 2101 U_ASSERT(!fullNameBuffer.isEmpty()); 2102 heap.ptr->fullName = std::move(fullNameBuffer); 2103 if (hasKeywords) { 2104 if (std::string_view::size_type baseNameLength = at - fullName; baseNameLength > 0) { 2105 heap.ptr->baseName = {heap.ptr->fullName.data(), baseNameLength}; 2106 if (heap.ptr->baseName.isEmpty()) { 2107 break; // error: out of memory 2108 } 2109 } 2110 } 2111 } 2112 2113 if (canonicalize) { 2114 if (!isKnownCanonicalizedLocale(getName(), err)) { 2115 CharString replaced; 2116 // Not sure it is already canonicalized 2117 if (canonicalizeLocale(*this, replaced, err)) { 2118 U_ASSERT(U_SUCCESS(err)); 2119 // If need replacement, call init again. 
2120 init(replaced.data(), false); 2121 } 2122 if (U_FAILURE(err)) { 2123 break; 2124 } 2125 } 2126 } // if (canonicalize) { 2127 2128 // successful end of init() 2129 return *this; 2130 } while(0); /*loop doesn't iterate*/ 2131 2132 // when an error occurs, then set this object to "bogus" (there is no UErrorCode here) 2133 setToBogus(); 2134 2135 return *this; 2136 } 2137 2138 int32_t 2139 Locale::hashCode() const 2140 { 2141 return ustr_hashCharsN(getName(), static_cast<int32_t>(uprv_strlen(getName()))); 2142 } 2143 2144 void 2145 Locale::setToBogus() { 2146 /* Free our current storage */ 2147 payload.setToBogus(); 2148 } 2149 2150 const Locale& U_EXPORT2 2151 Locale::getDefault() 2152 { 2153 { 2154 Mutex lock(&gDefaultLocaleMutex); 2155 if (gDefaultLocale != nullptr) { 2156 return *gDefaultLocale; 2157 } 2158 } 2159 UErrorCode status = U_ZERO_ERROR; 2160 return *locale_set_default_internal(nullptr, status); 2161 } 2162 2163 2164 2165 void U_EXPORT2 2166 Locale::setDefault( const Locale& newLocale, 2167 UErrorCode& status) 2168 { 2169 if (U_FAILURE(status)) { 2170 return; 2171 } 2172 2173 /* Set the default from the full name string of the supplied locale. 2174 * This is a convenient way to access the default locale caching mechanisms. 
2175 */ 2176 const char *localeID = newLocale.getName(); 2177 locale_set_default_internal(localeID, status); 2178 } 2179 2180 void 2181 Locale::addLikelySubtags(UErrorCode& status) { 2182 if (U_FAILURE(status)) { 2183 return; 2184 } 2185 2186 CharString maximizedLocaleID = ulocimp_addLikelySubtags(getName(), status); 2187 2188 if (U_FAILURE(status)) { 2189 if (status == U_MEMORY_ALLOCATION_ERROR) { 2190 setToBogus(); 2191 } 2192 return; 2193 } 2194 2195 init(maximizedLocaleID.data(), /*canonicalize=*/false); 2196 if (isBogus()) { 2197 status = U_ILLEGAL_ARGUMENT_ERROR; 2198 } 2199 } 2200 2201 void 2202 Locale::minimizeSubtags(UErrorCode& status) { 2203 Locale::minimizeSubtags(false, status); 2204 } 2205 void 2206 Locale::minimizeSubtags(bool favorScript, UErrorCode& status) { 2207 if (U_FAILURE(status)) { 2208 return; 2209 } 2210 2211 CharString minimizedLocaleID = ulocimp_minimizeSubtags(getName(), favorScript, status); 2212 2213 if (U_FAILURE(status)) { 2214 if (status == U_MEMORY_ALLOCATION_ERROR) { 2215 setToBogus(); 2216 } 2217 return; 2218 } 2219 2220 init(minimizedLocaleID.data(), /*canonicalize=*/false); 2221 if (isBogus()) { 2222 status = U_ILLEGAL_ARGUMENT_ERROR; 2223 } 2224 } 2225 2226 void 2227 Locale::canonicalize(UErrorCode& status) { 2228 if (U_FAILURE(status)) { 2229 return; 2230 } 2231 if (isBogus()) { 2232 status = U_ILLEGAL_ARGUMENT_ERROR; 2233 return; 2234 } 2235 CharString uncanonicalized(getName(), status); 2236 if (U_FAILURE(status)) { 2237 if (status == U_MEMORY_ALLOCATION_ERROR) { 2238 setToBogus(); 2239 } 2240 return; 2241 } 2242 init(uncanonicalized.data(), /*canonicalize=*/true); 2243 if (isBogus()) { 2244 status = U_ILLEGAL_ARGUMENT_ERROR; 2245 } 2246 } 2247 2248 Locale U_EXPORT2 2249 Locale::forLanguageTag(StringPiece tag, UErrorCode& status) 2250 { 2251 Locale result(Locale::eBOGUS); 2252 2253 if (U_FAILURE(status)) { 2254 return result; 2255 } 2256 2257 // If a BCP 47 language tag is passed as the language parameter to the 2258 // 
normal Locale constructor, it will actually fall back to invoking 2259 // uloc_forLanguageTag() to parse it if it somehow is able to detect that 2260 // the string actually is BCP 47. This works well for things like strings 2261 // using BCP 47 extensions, but it does not at all work for things like 2262 // legacy language tags (marked as “Type: grandfathered” in BCP 47, 2263 // e.g., "en-GB-oed") which are possible to also 2264 // interpret as ICU locale IDs and because of that won't trigger the BCP 47 2265 // parsing. Therefore the code here explicitly calls uloc_forLanguageTag() 2266 // and then Locale::init(), instead of just calling the normal constructor. 2267 2268 int32_t parsedLength; 2269 CharString localeID = ulocimp_forLanguageTag( 2270 tag.data(), 2271 tag.length(), 2272 &parsedLength, 2273 status); 2274 2275 if (U_FAILURE(status)) { 2276 return result; 2277 } 2278 2279 if (parsedLength != tag.size()) { 2280 status = U_ILLEGAL_ARGUMENT_ERROR; 2281 return result; 2282 } 2283 2284 result.init(localeID.data(), /*canonicalize=*/false); 2285 if (result.isBogus()) { 2286 status = U_ILLEGAL_ARGUMENT_ERROR; 2287 } 2288 return result; 2289 } 2290 2291 void 2292 Locale::toLanguageTag(ByteSink& sink, UErrorCode& status) const 2293 { 2294 if (U_FAILURE(status)) { 2295 return; 2296 } 2297 2298 if (isBogus()) { 2299 status = U_ILLEGAL_ARGUMENT_ERROR; 2300 return; 2301 } 2302 2303 ulocimp_toLanguageTag(getName(), sink, /*strict=*/false, status); 2304 } 2305 2306 Locale U_EXPORT2 2307 Locale::createFromName (const char *name) 2308 { 2309 if (name) { 2310 Locale l(""); 2311 l.init(name, false); 2312 return l; 2313 } 2314 else { 2315 return getDefault(); 2316 } 2317 } 2318 2319 Locale U_EXPORT2 2320 Locale::createFromName(StringPiece name) { 2321 Locale loc(""); 2322 loc.init(name, false); 2323 return loc; 2324 } 2325 2326 Locale U_EXPORT2 2327 Locale::createCanonical(const char* name) { 2328 Locale loc(""); 2329 loc.init(name, true); 2330 return loc; 2331 } 2332 2333 
const char * 2334 Locale::getISO3Language() const 2335 { 2336 return uloc_getISO3Language(getName()); 2337 } 2338 2339 2340 const char * 2341 Locale::getISO3Country() const 2342 { 2343 return uloc_getISO3Country(getName()); 2344 } 2345 2346 /** 2347 * Return the LCID value as specified in the "LocaleID" resource for this 2348 * locale. The LocaleID must be expressed as a hexadecimal number, from 2349 * one to four digits. If the LocaleID resource is not present, or is 2350 * in an incorrect format, 0 is returned. The LocaleID is for use in 2351 * Windows (it is an LCID), but is available on all platforms. 2352 */ 2353 uint32_t 2354 Locale::getLCID() const 2355 { 2356 return uloc_getLCID(getName()); 2357 } 2358 2359 const char* const* U_EXPORT2 Locale::getISOCountries() 2360 { 2361 return uloc_getISOCountries(); 2362 } 2363 2364 const char* const* U_EXPORT2 Locale::getISOLanguages() 2365 { 2366 return uloc_getISOLanguages(); 2367 } 2368 2369 // Set the locale's data based on a posix id. 2370 void Locale::setFromPOSIXID(const char *posixID) 2371 { 2372 init(posixID, true); 2373 } 2374 2375 const Locale & U_EXPORT2 2376 Locale::getRoot() 2377 { 2378 return getLocale(eROOT); 2379 } 2380 2381 const Locale & U_EXPORT2 2382 Locale::getEnglish() 2383 { 2384 return getLocale(eENGLISH); 2385 } 2386 2387 const Locale & U_EXPORT2 2388 Locale::getFrench() 2389 { 2390 return getLocale(eFRENCH); 2391 } 2392 2393 const Locale & U_EXPORT2 2394 Locale::getGerman() 2395 { 2396 return getLocale(eGERMAN); 2397 } 2398 2399 const Locale & U_EXPORT2 2400 Locale::getItalian() 2401 { 2402 return getLocale(eITALIAN); 2403 } 2404 2405 const Locale & U_EXPORT2 2406 Locale::getJapanese() 2407 { 2408 return getLocale(eJAPANESE); 2409 } 2410 2411 const Locale & U_EXPORT2 2412 Locale::getKorean() 2413 { 2414 return getLocale(eKOREAN); 2415 } 2416 2417 const Locale & U_EXPORT2 2418 Locale::getChinese() 2419 { 2420 return getLocale(eCHINESE); 2421 } 2422 2423 const Locale & U_EXPORT2 2424 
Locale::getSimplifiedChinese() 2425 { 2426 return getLocale(eCHINA); 2427 } 2428 2429 const Locale & U_EXPORT2 2430 Locale::getTraditionalChinese() 2431 { 2432 return getLocale(eTAIWAN); 2433 } 2434 2435 2436 const Locale & U_EXPORT2 2437 Locale::getFrance() 2438 { 2439 return getLocale(eFRANCE); 2440 } 2441 2442 const Locale & U_EXPORT2 2443 Locale::getGermany() 2444 { 2445 return getLocale(eGERMANY); 2446 } 2447 2448 const Locale & U_EXPORT2 2449 Locale::getItaly() 2450 { 2451 return getLocale(eITALY); 2452 } 2453 2454 const Locale & U_EXPORT2 2455 Locale::getJapan() 2456 { 2457 return getLocale(eJAPAN); 2458 } 2459 2460 const Locale & U_EXPORT2 2461 Locale::getKorea() 2462 { 2463 return getLocale(eKOREA); 2464 } 2465 2466 const Locale & U_EXPORT2 2467 Locale::getChina() 2468 { 2469 return getLocale(eCHINA); 2470 } 2471 2472 const Locale & U_EXPORT2 2473 Locale::getPRC() 2474 { 2475 return getLocale(eCHINA); 2476 } 2477 2478 const Locale & U_EXPORT2 2479 Locale::getTaiwan() 2480 { 2481 return getLocale(eTAIWAN); 2482 } 2483 2484 const Locale & U_EXPORT2 2485 Locale::getUK() 2486 { 2487 return getLocale(eUK); 2488 } 2489 2490 const Locale & U_EXPORT2 2491 Locale::getUS() 2492 { 2493 return getLocale(eUS); 2494 } 2495 2496 const Locale & U_EXPORT2 2497 Locale::getCanada() 2498 { 2499 return getLocale(eCANADA); 2500 } 2501 2502 const Locale & U_EXPORT2 2503 Locale::getCanadaFrench() 2504 { 2505 return getLocale(eCANADA_FRENCH); 2506 } 2507 2508 const Locale & 2509 Locale::getLocale(int locid) 2510 { 2511 Locale *localeCache = getLocaleCache(); 2512 U_ASSERT((locid < eMAX_LOCALES)&&(locid>=0)); 2513 if (localeCache == nullptr) { 2514 // Failure allocating the locale cache. 2515 // The best we can do is return a nullptr reference. 2516 locid = 0; 2517 } 2518 return localeCache[locid]; /*operating on nullptr*/ 2519 } 2520 2521 /* 2522 This function is defined this way in order to get around static 2523 initialization and static destruction. 
2524 */ 2525 Locale * 2526 Locale::getLocaleCache() 2527 { 2528 UErrorCode status = U_ZERO_ERROR; 2529 umtx_initOnce(gLocaleCacheInitOnce, locale_init, status); 2530 return gLocaleCache; 2531 } 2532 2533 class KeywordEnumeration : public StringEnumeration { 2534 protected: 2535 FixedString keywords; 2536 private: 2537 int32_t length; 2538 const char *current; 2539 static const char fgClassID; 2540 2541 public: 2542 static UClassID U_EXPORT2 getStaticClassID() { return (UClassID)&fgClassID; } 2543 virtual UClassID getDynamicClassID() const override { return getStaticClassID(); } 2544 public: 2545 KeywordEnumeration(const char *keys, int32_t keywordLen, int32_t currentIndex, UErrorCode &status) 2546 : keywords(), length(keywordLen), current(nullptr) { 2547 if(U_SUCCESS(status) && keywordLen != 0) { 2548 if(keys == nullptr || keywordLen < 0) { 2549 status = U_ILLEGAL_ARGUMENT_ERROR; 2550 } else { 2551 keywords = {keys, static_cast<std::string_view::size_type>(length)}; 2552 if (keywords.isEmpty()) { 2553 status = U_MEMORY_ALLOCATION_ERROR; 2554 } else { 2555 current = keywords.data() + currentIndex; 2556 } 2557 } 2558 } 2559 } 2560 2561 virtual ~KeywordEnumeration(); 2562 2563 virtual StringEnumeration * clone() const override 2564 { 2565 UErrorCode status = U_ZERO_ERROR; 2566 return new KeywordEnumeration( 2567 keywords.data(), length, 2568 static_cast<int32_t>(current - keywords.data()), status); 2569 } 2570 2571 virtual int32_t count(UErrorCode& status) const override { 2572 if (U_FAILURE(status)) { return 0; } 2573 const char *kw = keywords.data(); 2574 int32_t result = 0; 2575 while(*kw) { 2576 result++; 2577 kw += uprv_strlen(kw)+1; 2578 } 2579 return result; 2580 } 2581 2582 virtual const char* next(int32_t* resultLength, UErrorCode& status) override { 2583 const char* result; 2584 int32_t len; 2585 if(U_SUCCESS(status) && *current != 0) { 2586 result = current; 2587 len = static_cast<int32_t>(uprv_strlen(current)); 2588 current += len+1; 2589 if(resultLength 
!= nullptr) { 2590 *resultLength = len; 2591 } 2592 } else { 2593 if(resultLength != nullptr) { 2594 *resultLength = 0; 2595 } 2596 result = nullptr; 2597 } 2598 return result; 2599 } 2600 2601 virtual const UnicodeString* snext(UErrorCode& status) override { 2602 if (U_FAILURE(status)) { return nullptr; } 2603 int32_t resultLength = 0; 2604 const char *s = next(&resultLength, status); 2605 return setChars(s, resultLength, status); 2606 } 2607 2608 virtual void reset(UErrorCode& status) override { 2609 if (U_FAILURE(status)) { return; } 2610 current = keywords.data(); 2611 } 2612 }; 2613 2614 const char KeywordEnumeration::fgClassID = '\0'; 2615 2616 // Out-of-line virtual destructor to serve as the "key function". 2617 KeywordEnumeration::~KeywordEnumeration() = default; 2618 2619 // A wrapper around KeywordEnumeration that calls uloc_toUnicodeLocaleKey() in 2620 // the next() method for each keyword before returning it. 2621 class UnicodeKeywordEnumeration : public KeywordEnumeration { 2622 public: 2623 using KeywordEnumeration::KeywordEnumeration; 2624 virtual ~UnicodeKeywordEnumeration(); 2625 2626 virtual const char* next(int32_t* resultLength, UErrorCode& status) override { 2627 const char* legacy_key = KeywordEnumeration::next(nullptr, status); 2628 while (U_SUCCESS(status) && legacy_key != nullptr) { 2629 const char* key = uloc_toUnicodeLocaleKey(legacy_key); 2630 if (key != nullptr) { 2631 if (resultLength != nullptr) { 2632 *resultLength = static_cast<int32_t>(uprv_strlen(key)); 2633 } 2634 return key; 2635 } 2636 // Not a Unicode keyword, could be a t, x or other, continue to look at the next one. 
2637 legacy_key = KeywordEnumeration::next(nullptr, status); 2638 } 2639 if (resultLength != nullptr) *resultLength = 0; 2640 return nullptr; 2641 } 2642 virtual int32_t count(UErrorCode& status) const override { 2643 if (U_FAILURE(status)) { return 0; } 2644 const char *kw = keywords.data(); 2645 int32_t result = 0; 2646 while(*kw) { 2647 if (uloc_toUnicodeLocaleKey(kw) != nullptr) { 2648 result++; 2649 } 2650 kw += uprv_strlen(kw)+1; 2651 } 2652 return result; 2653 } 2654 }; 2655 2656 // Out-of-line virtual destructor to serve as the "key function". 2657 UnicodeKeywordEnumeration::~UnicodeKeywordEnumeration() = default; 2658 2659 StringEnumeration * 2660 Locale::createKeywords(UErrorCode &status) const 2661 { 2662 StringEnumeration *result = nullptr; 2663 2664 if (U_FAILURE(status)) { 2665 return result; 2666 } 2667 2668 const char* variantStart = uprv_strchr(getName(), '@'); 2669 const char* assignment = uprv_strchr(getName(), '='); 2670 if(variantStart) { 2671 if(assignment > variantStart) { 2672 CharString keywords = ulocimp_getKeywords(variantStart + 1, '@', false, status); 2673 if (U_SUCCESS(status) && !keywords.isEmpty()) { 2674 result = new KeywordEnumeration(keywords.data(), keywords.length(), 0, status); 2675 if (!result) { 2676 status = U_MEMORY_ALLOCATION_ERROR; 2677 } 2678 } 2679 } else { 2680 status = U_INVALID_FORMAT_ERROR; 2681 } 2682 } 2683 return result; 2684 } 2685 2686 StringEnumeration * 2687 Locale::createUnicodeKeywords(UErrorCode &status) const 2688 { 2689 StringEnumeration *result = nullptr; 2690 2691 if (U_FAILURE(status)) { 2692 return result; 2693 } 2694 2695 const char* variantStart = uprv_strchr(getName(), '@'); 2696 const char* assignment = uprv_strchr(getName(), '='); 2697 if(variantStart) { 2698 if(assignment > variantStart) { 2699 CharString keywords = ulocimp_getKeywords(variantStart + 1, '@', false, status); 2700 if (U_SUCCESS(status) && !keywords.isEmpty()) { 2701 result = new UnicodeKeywordEnumeration(keywords.data(), 
keywords.length(), 0, status); 2702 if (!result) { 2703 status = U_MEMORY_ALLOCATION_ERROR; 2704 } 2705 } 2706 } else { 2707 status = U_INVALID_FORMAT_ERROR; 2708 } 2709 } 2710 return result; 2711 } 2712 2713 int32_t 2714 Locale::getKeywordValue(const char* keywordName, char *buffer, int32_t bufLen, UErrorCode &status) const 2715 { 2716 return uloc_getKeywordValue(getName(), keywordName, buffer, bufLen, &status); 2717 } 2718 2719 void 2720 Locale::getKeywordValue(StringPiece keywordName, ByteSink& sink, UErrorCode& status) const { 2721 if (U_FAILURE(status)) { 2722 return; 2723 } 2724 2725 if (isBogus()) { 2726 status = U_ILLEGAL_ARGUMENT_ERROR; 2727 return; 2728 } 2729 2730 ulocimp_getKeywordValue(getName(), keywordName, sink, status); 2731 } 2732 2733 void 2734 Locale::getUnicodeKeywordValue(StringPiece keywordName, 2735 ByteSink& sink, 2736 UErrorCode& status) const { 2737 if (U_FAILURE(status)) { 2738 return; 2739 } 2740 2741 std::optional<std::string_view> legacy_key = ulocimp_toLegacyKeyWithFallback(keywordName); 2742 if (!legacy_key.has_value()) { 2743 status = U_ILLEGAL_ARGUMENT_ERROR; 2744 return; 2745 } 2746 2747 auto legacy_value = getKeywordValue<CharString>(*legacy_key, status); 2748 2749 if (U_FAILURE(status)) { 2750 return; 2751 } 2752 2753 std::optional<std::string_view> unicode_value = 2754 ulocimp_toBcpTypeWithFallback(keywordName, legacy_value.toStringPiece()); 2755 if (!unicode_value.has_value()) { 2756 status = U_ILLEGAL_ARGUMENT_ERROR; 2757 return; 2758 } 2759 2760 sink.Append(unicode_value->data(), static_cast<int32_t>(unicode_value->size())); 2761 } 2762 2763 void 2764 Locale::setKeywordValue(StringPiece keywordName, 2765 StringPiece keywordValue, 2766 UErrorCode& status) { 2767 if (U_FAILURE(status)) { return; } 2768 if (keywordName.empty()) { 2769 status = U_ILLEGAL_ARGUMENT_ERROR; 2770 return; 2771 } 2772 if (status == U_STRING_NOT_TERMINATED_WARNING) { 2773 status = U_ZERO_ERROR; 2774 } 2775 2776 CharString localeID(getName(), -1, 
status); 2777 ulocimp_setKeywordValue(keywordName, keywordValue, localeID, status); 2778 if (U_FAILURE(status)) { 2779 if (status == U_MEMORY_ALLOCATION_ERROR) { 2780 setToBogus(); 2781 } 2782 return; 2783 } 2784 2785 const char* at = locale_getKeywordsStart(localeID.toStringPiece()); 2786 bool hasKeywords = at != nullptr && uprv_strchr(at + 1, '=') != nullptr; 2787 2788 Nest* nest = payload.get<Nest>(); 2789 if (!hasKeywords) { 2790 if (nest == nullptr) { 2791 // There are no longer any keywords left, so it might now be 2792 // possible to move the payload from Heap to Nest. 2793 Heap* heap = payload.get<Heap>(); 2794 U_ASSERT(heap != nullptr); 2795 if (Nest::fits(localeID.length(), heap->language, heap->script, heap->region)) { 2796 int32_t variantBegin = heap->ptr->variantBegin; 2797 U_ASSERT(variantBegin >= 0); 2798 U_ASSERT(static_cast<size_t>(variantBegin) < sizeof Nest::baseName); 2799 nest = &payload.emplace<Nest>(std::move(*heap), static_cast<uint8_t>(variantBegin)); 2800 localeID.extract(nest->baseName, sizeof Nest::baseName, status); 2801 } else { 2802 heap->ptr->baseName.clear(); 2803 heap->ptr->fullName = localeID.toStringPiece(); 2804 if (heap->ptr->fullName.isEmpty()) { 2805 status = U_MEMORY_ALLOCATION_ERROR; 2806 setToBogus(); 2807 return; 2808 } 2809 } 2810 } 2811 } else { 2812 Heap* heap = nullptr; 2813 if (nest != nullptr) { 2814 // A keyword has been added, so the payload now needs to be moved 2815 // from Nest to Heap so that it can get a baseName. 
2816 Nest copy(*nest); 2817 heap = &payload.emplace<Heap>(copy.language, 2818 copy.script, 2819 copy.region, 2820 copy.variantBegin); 2821 if (isBogus()) { 2822 status = U_MEMORY_ALLOCATION_ERROR; 2823 return; 2824 } 2825 } else { 2826 heap = payload.get<Heap>(); 2827 } 2828 U_ASSERT(heap != nullptr); 2829 heap->ptr->fullName = localeID.toStringPiece(); 2830 if (heap->ptr->fullName.isEmpty()) { 2831 status = U_MEMORY_ALLOCATION_ERROR; 2832 setToBogus(); 2833 return; 2834 } 2835 2836 if (heap->ptr->baseName.isEmpty()) { 2837 // Has added the first keyword, meaning that the fullName is no longer also the baseName. 2838 if (std::string_view::size_type baseNameLength = at - localeID.data(); baseNameLength > 0) { 2839 heap->ptr->baseName = {heap->ptr->fullName.data(), baseNameLength}; 2840 if (heap->ptr->baseName.isEmpty()) { 2841 status = U_MEMORY_ALLOCATION_ERROR; 2842 setToBogus(); 2843 return; 2844 } 2845 } 2846 } 2847 } 2848 } 2849 2850 void 2851 Locale::setUnicodeKeywordValue(StringPiece keywordName, 2852 StringPiece keywordValue, 2853 UErrorCode& status) { 2854 if (U_FAILURE(status)) { 2855 return; 2856 } 2857 2858 std::optional<std::string_view> legacy_key = ulocimp_toLegacyKeyWithFallback(keywordName); 2859 if (!legacy_key.has_value()) { 2860 status = U_ILLEGAL_ARGUMENT_ERROR; 2861 return; 2862 } 2863 2864 std::string_view value; 2865 2866 if (!keywordValue.empty()) { 2867 std::optional<std::string_view> legacy_value = 2868 ulocimp_toLegacyTypeWithFallback(keywordName, keywordValue); 2869 if (!legacy_value.has_value()) { 2870 status = U_ILLEGAL_ARGUMENT_ERROR; 2871 return; 2872 } 2873 value = *legacy_value; 2874 } 2875 2876 setKeywordValue(*legacy_key, value, status); 2877 } 2878 2879 const char* 2880 Locale::getCountry() const { 2881 return getField<&Nest::getRegion, &Heap::getRegion>(); 2882 } 2883 2884 const char* 2885 Locale::getLanguage() const { 2886 return getField<&Nest::getLanguage, &Heap::getLanguage>(); 2887 } 2888 2889 const char* 2890 
Locale::getScript() const { 2891 return getField<&Nest::getScript, &Heap::getScript>(); 2892 } 2893 2894 const char* 2895 Locale::getVariant() const { 2896 return getField<&Nest::getVariant, &Heap::getVariant>(); 2897 } 2898 2899 const char* 2900 Locale::getName() const { 2901 return getField<&Nest::getBaseName, &Heap::getFullName>(); 2902 } 2903 2904 const char* 2905 Locale::getBaseName() const { 2906 return getField<&Nest::getBaseName, &Heap::getBaseName>(); 2907 } 2908 2909 template <const char* (Locale::Nest::*const NEST)() const, 2910 const char* (Locale::Heap::*const HEAP)() const> 2911 const char* Locale::getField() const { 2912 return payload.visit([] { return ""; }, 2913 [](const Nest& nest) { return (nest.*NEST)(); }, 2914 [](const Heap& heap) { return (heap.*HEAP)(); }); 2915 } 2916 2917 Locale::Iterator::~Iterator() = default; 2918 2919 //eof 2920 U_NAMESPACE_END