loclikely.cpp (16033B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1997-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: loclikely.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2010feb25 16 * created by: Markus W. Scherer 17 * 18 * Code for likely and minimized locale subtags, separated out from other .cpp files 19 * that then do not depend on resource bundle code and likely-subtags data. 20 */ 21 22 #include <string_view> 23 #include <utility> 24 25 #include "unicode/bytestream.h" 26 #include "unicode/utypes.h" 27 #include "unicode/locid.h" 28 #include "unicode/putil.h" 29 #include "unicode/uchar.h" 30 #include "unicode/uloc.h" 31 #include "unicode/ures.h" 32 #include "unicode/uscript.h" 33 #include "bytesinkutil.h" 34 #include "charstr.h" 35 #include "cmemory.h" 36 #include "cstring.h" 37 #include "loclikelysubtags.h" 38 #include "ulocimp.h" 39 40 namespace { 41 42 /** 43 * Create a tag string from the supplied parameters. The lang, script and region 44 * parameters may be nullptr pointers. If they are, their corresponding length parameters 45 * must be less than or equal to 0. 46 * 47 * If an illegal argument is provided, the function returns the error 48 * U_ILLEGAL_ARGUMENT_ERROR. 49 * 50 * @param lang The language tag to use. 51 * @param langLength The length of the language tag. 52 * @param script The script tag to use. 53 * @param scriptLength The length of the script tag. 54 * @param region The region tag to use. 55 * @param regionLength The length of the region tag. 56 * @param variant The region tag to use. 57 * @param variantLength The length of the region tag. 58 * @param trailing Any trailing data to append to the new tag. 59 * @param trailingLength The length of the trailing data. 60 * @param sink The output sink receiving the tag string. 61 * @param err A pointer to a UErrorCode for error reporting. 62 **/ 63 void U_CALLCONV 64 createTagStringWithAlternates( 65 const char* lang, 66 int32_t langLength, 67 const char* script, 68 int32_t scriptLength, 69 const char* region, 70 int32_t regionLength, 71 const char* variant, 72 int32_t variantLength, 73 const char* trailing, 74 int32_t trailingLength, 75 icu::ByteSink& sink, 76 UErrorCode& err) { 77 if (U_FAILURE(err)) { 78 return; 79 } 80 81 if (langLength >= ULOC_LANG_CAPACITY || 82 scriptLength >= ULOC_SCRIPT_CAPACITY || 83 regionLength >= ULOC_COUNTRY_CAPACITY) { 84 err = U_ILLEGAL_ARGUMENT_ERROR; 85 return; 86 } 87 88 if (langLength > 0) { 89 sink.Append(lang, langLength); 90 } 91 92 if (scriptLength > 0) { 93 sink.Append("_", 1); 94 sink.Append(script, scriptLength); 95 } 96 97 if (regionLength > 0) { 98 sink.Append("_", 1); 99 sink.Append(region, regionLength); 100 } 101 102 if (variantLength > 0) { 103 if (regionLength == 0) { 104 /* extra separator is required */ 105 sink.Append("_", 1); 106 } 107 sink.Append("_", 1); 108 sink.Append(variant, variantLength); 109 } 110 111 if (trailingLength > 0) { 112 /* 113 * Copy the trailing data into the supplied buffer. 114 */ 115 sink.Append(trailing, trailingLength); 116 } 117 } 118 119 bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) { 120 int32_t count = 0; 121 for (int32_t i = 0; i < variantLength; i++) { 122 if (_isIDSeparator(variant[i])) { 123 count = 0; 124 } else if (count == 8) { 125 return false; 126 } else { 127 count++; 128 } 129 } 130 return true; 131 } 132 133 void 134 _uloc_addLikelySubtags(const char* localeID, 135 icu::ByteSink& sink, 136 UErrorCode& err) { 137 if (U_FAILURE(err)) { 138 return; 139 } 140 141 if (localeID == nullptr) { 142 err = U_ILLEGAL_ARGUMENT_ERROR; 143 return; 144 } 145 146 icu::CharString lang; 147 icu::CharString script; 148 icu::CharString region; 149 icu::CharString variant; 150 const char* trailing = nullptr; 151 ulocimp_getSubtags(localeID, &lang, &script, ®ion, &variant, &trailing, err); 152 if (U_FAILURE(err)) { 153 return; 154 } 155 156 if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) { 157 err = U_ILLEGAL_ARGUMENT_ERROR; 158 return; 159 } 160 161 if (lang.length() == 4) { 162 if (script.isEmpty()) { 163 script = std::move(lang); 164 lang.clear(); 165 } else { 166 err = U_ILLEGAL_ARGUMENT_ERROR; 167 return; 168 } 169 } else if (lang.length() > 8) { 170 err = U_ILLEGAL_ARGUMENT_ERROR; 171 return; 172 } 173 174 int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing)); 175 176 const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err); 177 if (U_FAILURE(err)) { 178 return; 179 } 180 // We need to keep l on the stack because lsr may point into internal 181 // memory of l. 182 icu::Locale l = icu::Locale::createFromName(localeID); 183 if (l.isBogus()) { 184 err = U_ILLEGAL_ARGUMENT_ERROR; 185 return; 186 } 187 icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err); 188 if (U_FAILURE(err)) { 189 return; 190 } 191 const char* language = lsr.language; 192 if (uprv_strcmp(language, "und") == 0) { 193 language = ""; 194 } 195 createTagStringWithAlternates( 196 language, 197 static_cast<int32_t>(uprv_strlen(language)), 198 lsr.script, 199 static_cast<int32_t>(uprv_strlen(lsr.script)), 200 lsr.region, 201 static_cast<int32_t>(uprv_strlen(lsr.region)), 202 variant.data(), 203 variant.length(), 204 trailing, 205 trailingLength, 206 sink, 207 err); 208 } 209 210 void 211 _uloc_minimizeSubtags(const char* localeID, 212 icu::ByteSink& sink, 213 bool favorScript, 214 UErrorCode& err) { 215 if (U_FAILURE(err)) { 216 return; 217 } 218 219 if (localeID == nullptr) { 220 err = U_ILLEGAL_ARGUMENT_ERROR; 221 return; 222 } 223 224 icu::CharString lang; 225 icu::CharString script; 226 icu::CharString region; 227 icu::CharString variant; 228 const char* trailing = nullptr; 229 ulocimp_getSubtags(localeID, &lang, &script, ®ion, &variant, &trailing, err); 230 if (U_FAILURE(err)) { 231 return; 232 } 233 234 if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) { 235 err = U_ILLEGAL_ARGUMENT_ERROR; 236 return; 237 } 238 239 int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing)); 240 241 const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err); 242 if (U_FAILURE(err)) { 243 return; 244 } 245 icu::LSR lsr = likelySubtags->minimizeSubtags( 246 lang.toStringPiece(), 247 script.toStringPiece(), 248 region.toStringPiece(), 249 favorScript, 250 err); 251 if (U_FAILURE(err)) { 252 return; 253 } 254 const char* language = lsr.language; 255 if (uprv_strcmp(language, "und") == 0) { 256 language = ""; 257 } 258 createTagStringWithAlternates( 259 language, 260 static_cast<int32_t>(uprv_strlen(language)), 261 lsr.script, 262 static_cast<int32_t>(uprv_strlen(lsr.script)), 263 lsr.region, 264 static_cast<int32_t>(uprv_strlen(lsr.region)), 265 variant.data(), 266 variant.length(), 267 trailing, 268 trailingLength, 269 sink, 270 err); 271 } 272 273 } // namespace 274 275 U_CAPI int32_t U_EXPORT2 276 uloc_addLikelySubtags(const char* localeID, 277 char* maximizedLocaleID, 278 int32_t maximizedLocaleIDCapacity, 279 UErrorCode* status) { 280 return icu::ByteSinkUtil::viaByteSinkToTerminatedChars( 281 maximizedLocaleID, maximizedLocaleIDCapacity, 282 [&](icu::ByteSink& sink, UErrorCode& status) { 283 ulocimp_addLikelySubtags(localeID, sink, status); 284 }, 285 *status); 286 } 287 288 U_EXPORT icu::CharString 289 ulocimp_addLikelySubtags(const char* localeID, 290 UErrorCode& status) { 291 return icu::ByteSinkUtil::viaByteSinkToCharString( 292 [&](icu::ByteSink& sink, UErrorCode& status) { 293 ulocimp_addLikelySubtags(localeID, sink, status); 294 }, 295 status); 296 } 297 298 U_EXPORT void 299 ulocimp_addLikelySubtags(const char* localeID, 300 icu::ByteSink& sink, 301 UErrorCode& status) { 302 if (U_FAILURE(status)) { return; } 303 if (localeID == nullptr) { 304 localeID = uloc_getDefault(); 305 } 306 icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status); 307 _uloc_addLikelySubtags(localeBuffer.data(), sink, status); 308 } 309 310 U_CAPI int32_t U_EXPORT2 311 uloc_minimizeSubtags(const char* localeID, 312 char* minimizedLocaleID, 313 int32_t minimizedLocaleIDCapacity, 314 UErrorCode* status) { 315 return icu::ByteSinkUtil::viaByteSinkToTerminatedChars( 316 minimizedLocaleID, minimizedLocaleIDCapacity, 317 [&](icu::ByteSink& sink, UErrorCode& status) { 318 ulocimp_minimizeSubtags(localeID, sink, false, status); 319 }, 320 *status); 321 } 322 323 U_EXPORT icu::CharString 324 ulocimp_minimizeSubtags(const char* localeID, 325 bool favorScript, 326 UErrorCode& status) { 327 return icu::ByteSinkUtil::viaByteSinkToCharString( 328 [&](icu::ByteSink& sink, UErrorCode& status) { 329 ulocimp_minimizeSubtags(localeID, sink, favorScript, status); 330 }, 331 status); 332 } 333 334 U_EXPORT void 335 ulocimp_minimizeSubtags(const char* localeID, 336 icu::ByteSink& sink, 337 bool favorScript, 338 UErrorCode& status) { 339 if (U_FAILURE(status)) { return; } 340 if (localeID == nullptr) { 341 localeID = uloc_getDefault(); 342 } 343 icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status); 344 _uloc_minimizeSubtags(localeBuffer.data(), sink, favorScript, status); 345 } 346 347 // Pairs of (language subtag, + or -) for finding out fast if common languages 348 // are LTR (minus) or RTL (plus). 349 static const char LANG_DIR_STRING[] = 350 "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-"; 351 352 // Implemented here because this calls ulocimp_addLikelySubtags(). 353 U_CAPI UBool U_EXPORT2 354 uloc_isRightToLeft(const char *locale) { 355 UErrorCode errorCode = U_ZERO_ERROR; 356 icu::CharString lang; 357 icu::CharString script; 358 ulocimp_getSubtags( 359 locale == nullptr ? uloc_getDefault() : locale, 360 &lang, &script, nullptr, nullptr, nullptr, errorCode); 361 if (U_FAILURE(errorCode) || script.isEmpty()) { 362 // Fastpath: We know the likely scripts and their writing direction 363 // for some common languages. 364 if (!lang.isEmpty()) { 365 const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data()); 366 if (langPtr != nullptr) { 367 switch (langPtr[lang.length()]) { 368 case '-': return false; 369 case '+': return true; 370 default: break; // partial match of a longer code 371 } 372 } 373 } 374 // Otherwise, find the likely script. 375 errorCode = U_ZERO_ERROR; 376 icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode); 377 if (U_FAILURE(errorCode)) { 378 return false; 379 } 380 ulocimp_getSubtags(likely.toStringPiece(), nullptr, &script, nullptr, nullptr, nullptr, errorCode); 381 if (U_FAILURE(errorCode) || script.isEmpty()) { 382 return false; 383 } 384 } 385 UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data()); 386 return uscript_isRightToLeft(scriptCode); 387 } 388 389 U_NAMESPACE_BEGIN 390 391 UBool 392 Locale::isRightToLeft() const { 393 return uloc_isRightToLeft(getBaseName()); 394 } 395 396 U_NAMESPACE_END 397 398 namespace { 399 icu::CharString 400 GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) { 401 icu::CharString result; 402 // First check for keyword value 403 icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status); 404 int32_t len = kw.length(); 405 // In UTS35 406 // type = alphanum{3,8} (sep alphanum{3,8})* ; 407 // so we know the subdivision must fit the type already. 408 // 409 // unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ; 410 // unicode_region_subtag = (alpha{2} | digit{3}) ; 411 // unicode_subdivision_suffix = alphanum{1,4} ; 412 // But we also know there are no id in start with digit{3} in 413 // https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml 414 // Therefore we can simplify as 415 // unicode_subdivision_id = alpha{2} alphanum{1,4} 416 // 417 // and only need to accept/reject the code based on the alpha{2} and the length. 418 if (U_SUCCESS(status) && len >= 3 && len <= 6 && 419 uprv_isASCIILetter(kw[0]) && uprv_isASCIILetter(kw[1])) { 420 // Additional Check 421 static icu::RegionValidateMap valid; 422 const char region[] = {kw[0], kw[1], '\0'}; 423 if (valid.isSet(region)) { 424 result.append(uprv_toupper(kw[0]), status); 425 result.append(uprv_toupper(kw[1]), status); 426 } 427 } 428 return result; 429 } 430 } // namespace 431 432 U_EXPORT icu::CharString 433 ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion, 434 UErrorCode& status) { 435 if (U_FAILURE(status)) { 436 return {}; 437 } 438 icu::CharString rgBuf = GetRegionFromKey(localeID, "rg", status); 439 if (U_SUCCESS(status) && rgBuf.isEmpty()) { 440 // No valid rg keyword value, try for unicode_region_subtag 441 rgBuf = ulocimp_getRegion(localeID == nullptr ? uloc_getDefault() : localeID, status); 442 if (U_SUCCESS(status) && rgBuf.isEmpty() && inferRegion) { 443 // Second check for sd keyword value 444 rgBuf = GetRegionFromKey(localeID, "sd", status); 445 if (U_SUCCESS(status) && rgBuf.isEmpty()) { 446 // no unicode_region_subtag but inferRegion true, try likely subtags 447 UErrorCode rgStatus = U_ZERO_ERROR; 448 icu::CharString locBuf = ulocimp_addLikelySubtags(localeID, rgStatus); 449 if (U_SUCCESS(rgStatus)) { 450 rgBuf = ulocimp_getRegion(locBuf.toStringPiece(), status); 451 } 452 } 453 } 454 } 455 456 return rgBuf; 457 } 458 459 namespace { 460 461 // The following data is generated by unit test code inside 462 // test/intltest/regiontst.cpp from the resource data while 463 // the test failed. 464 const uint32_t gValidRegionMap[] = { 465 0xeedf597c, 0xdeddbdef, 0x15943f3f, 0x0e00d580, 466 0xb0095c00, 0x0015fb9f, 0x781c068d, 0x0340400f, 467 0xf42b1d00, 0xfd4f8141, 0x25d7fffc, 0x0100084b, 468 0x538f3c40, 0x40000001, 0xfdf15100, 0x9fbb7ae7, 469 0x0410419a, 0x00408557, 0x00004002, 0x00100001, 470 0x00400408, 0x00000001, 471 }; 472 473 } // namespace 474 // 475 U_NAMESPACE_BEGIN 476 RegionValidateMap::RegionValidateMap() { 477 uprv_memcpy(map, gValidRegionMap, sizeof(map)); 478 } 479 480 RegionValidateMap::~RegionValidateMap() { 481 } 482 483 bool RegionValidateMap::isSet(const char* region) const { 484 int32_t index = value(region); 485 if (index < 0) { 486 return false; 487 } 488 return 0 != (map[index / 32] & (1L << (index % 32))); 489 } 490 491 bool RegionValidateMap::equals(const RegionValidateMap& that) const { 492 return uprv_memcmp(map, that.map, sizeof(map)) == 0; 493 } 494 495 // The code transform two letter a-z to a integer valued between -1, 26x26. 496 // -1 indicate the region is outside the range of two letter a-z 497 // the rest of value is between 0 and 676 (= 26x26) and used as an index 498 // the bigmap in map. The map is an array of 22 int32_t. 499 // since 32x21 < 676/32 < 32x22 we store this 676 bits bitmap into 22 int32_t. 500 int32_t RegionValidateMap::value(const char* region) const { 501 if (uprv_isASCIILetter(region[0]) && uprv_isASCIILetter(region[1]) && 502 region[2] == '\0') { 503 return (uprv_toupper(region[0])-'A') * 26 + 504 (uprv_toupper(region[1])-'A'); 505 } 506 return -1; 507 } 508 509 U_NAMESPACE_END