tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

loclikely.cpp (16033B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 1997-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  loclikely.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2010feb25
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Code for likely and minimized locale subtags, separated out from other .cpp files
     19 *   that then do not depend on resource bundle code and likely-subtags data.
     20 */
     21 
     22 #include <string_view>
     23 #include <utility>
     24 
     25 #include "unicode/bytestream.h"
     26 #include "unicode/utypes.h"
     27 #include "unicode/locid.h"
     28 #include "unicode/putil.h"
     29 #include "unicode/uchar.h"
     30 #include "unicode/uloc.h"
     31 #include "unicode/ures.h"
     32 #include "unicode/uscript.h"
     33 #include "bytesinkutil.h"
     34 #include "charstr.h"
     35 #include "cmemory.h"
     36 #include "cstring.h"
     37 #include "loclikelysubtags.h"
     38 #include "ulocimp.h"
     39 
     40 namespace {
     41 
     42 /**
     43 * Create a tag string from the supplied parameters.  The lang, script and region
     44 * parameters may be nullptr pointers. If they are, their corresponding length parameters
     45 * must be less than or equal to 0.
     46 *
     47 * If an illegal argument is provided, the function returns the error
     48 * U_ILLEGAL_ARGUMENT_ERROR.
     49 *
     50 * @param lang The language tag to use.
     51 * @param langLength The length of the language tag.
     52 * @param script The script tag to use.
     53 * @param scriptLength The length of the script tag.
     54 * @param region The region tag to use.
     55 * @param regionLength The length of the region tag.
     56 * @param variant The region tag to use.
     57 * @param variantLength The length of the region tag.
     58 * @param trailing Any trailing data to append to the new tag.
     59 * @param trailingLength The length of the trailing data.
     60 * @param sink The output sink receiving the tag string.
     61 * @param err A pointer to a UErrorCode for error reporting.
     62 **/
     63 void U_CALLCONV
     64 createTagStringWithAlternates(
     65    const char* lang,
     66    int32_t langLength,
     67    const char* script,
     68    int32_t scriptLength,
     69    const char* region,
     70    int32_t regionLength,
     71    const char* variant,
     72    int32_t variantLength,
     73    const char* trailing,
     74    int32_t trailingLength,
     75    icu::ByteSink& sink,
     76    UErrorCode& err) {
     77    if (U_FAILURE(err)) {
     78        return;
     79    }
     80 
     81    if (langLength >= ULOC_LANG_CAPACITY ||
     82            scriptLength >= ULOC_SCRIPT_CAPACITY ||
     83            regionLength >= ULOC_COUNTRY_CAPACITY) {
     84        err = U_ILLEGAL_ARGUMENT_ERROR;
     85        return;
     86    }
     87 
     88    if (langLength > 0) {
     89        sink.Append(lang, langLength);
     90    }
     91 
     92    if (scriptLength > 0) {
     93        sink.Append("_", 1);
     94        sink.Append(script, scriptLength);
     95    }
     96 
     97    if (regionLength > 0) {
     98        sink.Append("_", 1);
     99        sink.Append(region, regionLength);
    100    }
    101 
    102    if (variantLength > 0) {
    103        if (regionLength == 0) {
    104            /* extra separator is required */
    105            sink.Append("_", 1);
    106        }
    107        sink.Append("_", 1);
    108        sink.Append(variant, variantLength);
    109    }
    110 
    111    if (trailingLength > 0) {
    112        /*
    113         * Copy the trailing data into the supplied buffer.
    114         */
    115        sink.Append(trailing, trailingLength);
    116    }
    117 }
    118 
    119 bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) {
    120    int32_t count = 0;
    121    for (int32_t i = 0; i < variantLength; i++) {
    122        if (_isIDSeparator(variant[i])) {
    123            count = 0;
    124        } else if (count == 8) {
    125            return false;
    126        } else {
    127            count++;
    128        }
    129    }
    130    return true;
    131 }
    132 
    133 void
    134 _uloc_addLikelySubtags(const char* localeID,
    135                       icu::ByteSink& sink,
    136                       UErrorCode& err) {
    137    if (U_FAILURE(err)) {
    138        return;
    139    }
    140 
    141    if (localeID == nullptr) {
    142        err = U_ILLEGAL_ARGUMENT_ERROR;
    143        return;
    144    }
    145 
    146    icu::CharString lang;
    147    icu::CharString script;
    148    icu::CharString region;
    149    icu::CharString variant;
    150    const char* trailing = nullptr;
    151    ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
    152    if (U_FAILURE(err)) {
    153        return;
    154    }
    155 
    156    if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
    157        err = U_ILLEGAL_ARGUMENT_ERROR;
    158        return;
    159    }
    160 
    161    if (lang.length() == 4) {
    162        if (script.isEmpty()) {
    163            script = std::move(lang);
    164            lang.clear();
    165        } else {
    166            err = U_ILLEGAL_ARGUMENT_ERROR;
    167            return;
    168        }
    169    } else if (lang.length() > 8) {
    170        err = U_ILLEGAL_ARGUMENT_ERROR;
    171        return;
    172    }
    173 
    174    int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
    175 
    176    const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
    177    if (U_FAILURE(err)) {
    178        return;
    179    }
    180    // We need to keep l on the stack because lsr may point into internal
    181    // memory of l.
    182    icu::Locale l = icu::Locale::createFromName(localeID);
    183    if (l.isBogus()) {
    184        err = U_ILLEGAL_ARGUMENT_ERROR;
    185        return;
    186    }
    187    icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err);
    188    if (U_FAILURE(err)) {
    189        return;
    190    }
    191    const char* language = lsr.language;
    192    if (uprv_strcmp(language, "und") == 0) {
    193        language = "";
    194    }
    195    createTagStringWithAlternates(
    196        language,
    197        static_cast<int32_t>(uprv_strlen(language)),
    198        lsr.script,
    199        static_cast<int32_t>(uprv_strlen(lsr.script)),
    200        lsr.region,
    201        static_cast<int32_t>(uprv_strlen(lsr.region)),
    202        variant.data(),
    203        variant.length(),
    204        trailing,
    205        trailingLength,
    206        sink,
    207        err);
    208 }
    209 
    210 void
    211 _uloc_minimizeSubtags(const char* localeID,
    212                      icu::ByteSink& sink,
    213                      bool favorScript,
    214                      UErrorCode& err) {
    215    if (U_FAILURE(err)) {
    216        return;
    217    }
    218 
    219    if (localeID == nullptr) {
    220        err = U_ILLEGAL_ARGUMENT_ERROR;
    221        return;
    222    }
    223 
    224    icu::CharString lang;
    225    icu::CharString script;
    226    icu::CharString region;
    227    icu::CharString variant;
    228    const char* trailing = nullptr;
    229    ulocimp_getSubtags(localeID, &lang, &script, &region, &variant, &trailing, err);
    230    if (U_FAILURE(err)) {
    231        return;
    232    }
    233 
    234    if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) {
    235        err = U_ILLEGAL_ARGUMENT_ERROR;
    236        return;
    237    }
    238 
    239    int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
    240 
    241    const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
    242    if (U_FAILURE(err)) {
    243        return;
    244    }
    245    icu::LSR lsr = likelySubtags->minimizeSubtags(
    246        lang.toStringPiece(),
    247        script.toStringPiece(),
    248        region.toStringPiece(),
    249        favorScript,
    250        err);
    251    if (U_FAILURE(err)) {
    252        return;
    253    }
    254    const char* language = lsr.language;
    255    if (uprv_strcmp(language, "und") == 0) {
    256        language = "";
    257    }
    258    createTagStringWithAlternates(
    259        language,
    260        static_cast<int32_t>(uprv_strlen(language)),
    261        lsr.script,
    262        static_cast<int32_t>(uprv_strlen(lsr.script)),
    263        lsr.region,
    264        static_cast<int32_t>(uprv_strlen(lsr.region)),
    265        variant.data(),
    266        variant.length(),
    267        trailing,
    268        trailingLength,
    269        sink,
    270        err);
    271 }
    272 
    273 }  // namespace
    274 
    275 U_CAPI int32_t U_EXPORT2
    276 uloc_addLikelySubtags(const char* localeID,
    277                      char* maximizedLocaleID,
    278                      int32_t maximizedLocaleIDCapacity,
    279                      UErrorCode* status) {
    280    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
    281        maximizedLocaleID, maximizedLocaleIDCapacity,
    282        [&](icu::ByteSink& sink, UErrorCode& status) {
    283            ulocimp_addLikelySubtags(localeID, sink, status);
    284        },
    285        *status);
    286 }
    287 
    288 U_EXPORT icu::CharString
    289 ulocimp_addLikelySubtags(const char* localeID,
    290                         UErrorCode& status) {
    291    return icu::ByteSinkUtil::viaByteSinkToCharString(
    292        [&](icu::ByteSink& sink, UErrorCode& status) {
    293            ulocimp_addLikelySubtags(localeID, sink, status);
    294        },
    295        status);
    296 }
    297 
    298 U_EXPORT void
    299 ulocimp_addLikelySubtags(const char* localeID,
    300                         icu::ByteSink& sink,
    301                         UErrorCode& status) {
    302    if (U_FAILURE(status)) { return; }
    303    if (localeID == nullptr) {
    304        localeID = uloc_getDefault();
    305    }
    306    icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
    307    _uloc_addLikelySubtags(localeBuffer.data(), sink, status);
    308 }
    309 
    310 U_CAPI int32_t U_EXPORT2
    311 uloc_minimizeSubtags(const char* localeID,
    312                     char* minimizedLocaleID,
    313                     int32_t minimizedLocaleIDCapacity,
    314                     UErrorCode* status) {
    315    return icu::ByteSinkUtil::viaByteSinkToTerminatedChars(
    316        minimizedLocaleID, minimizedLocaleIDCapacity,
    317        [&](icu::ByteSink& sink, UErrorCode& status) {
    318            ulocimp_minimizeSubtags(localeID, sink, false, status);
    319        },
    320        *status);
    321 }
    322 
    323 U_EXPORT icu::CharString
    324 ulocimp_minimizeSubtags(const char* localeID,
    325                        bool favorScript,
    326                        UErrorCode& status) {
    327    return icu::ByteSinkUtil::viaByteSinkToCharString(
    328        [&](icu::ByteSink& sink, UErrorCode& status) {
    329            ulocimp_minimizeSubtags(localeID, sink, favorScript, status);
    330        },
    331        status);
    332 }
    333 
    334 U_EXPORT void
    335 ulocimp_minimizeSubtags(const char* localeID,
    336                        icu::ByteSink& sink,
    337                        bool favorScript,
    338                        UErrorCode& status) {
    339    if (U_FAILURE(status)) { return; }
    340    if (localeID == nullptr) {
    341        localeID = uloc_getDefault();
    342    }
    343    icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status);
    344    _uloc_minimizeSubtags(localeBuffer.data(), sink, favorScript, status);
    345 }
    346 
    347 // Pairs of (language subtag, + or -) for finding out fast if common languages
    348 // are LTR (minus) or RTL (plus).
    349 static const char LANG_DIR_STRING[] =
    350        "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-";
    351 
    352 // Implemented here because this calls ulocimp_addLikelySubtags().
    353 U_CAPI UBool U_EXPORT2
    354 uloc_isRightToLeft(const char *locale) {
    355    UErrorCode errorCode = U_ZERO_ERROR;
    356    icu::CharString lang;
    357    icu::CharString script;
    358    ulocimp_getSubtags(
    359        locale == nullptr ? uloc_getDefault() : locale,
    360        &lang, &script, nullptr, nullptr, nullptr, errorCode);
    361    if (U_FAILURE(errorCode) || script.isEmpty()) {
    362        // Fastpath: We know the likely scripts and their writing direction
    363        // for some common languages.
    364        if (!lang.isEmpty()) {
    365            const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data());
    366            if (langPtr != nullptr) {
    367                switch (langPtr[lang.length()]) {
    368                case '-': return false;
    369                case '+': return true;
    370                default: break;  // partial match of a longer code
    371                }
    372            }
    373        }
    374        // Otherwise, find the likely script.
    375        errorCode = U_ZERO_ERROR;
    376        icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode);
    377        if (U_FAILURE(errorCode)) {
    378            return false;
    379        }
    380        ulocimp_getSubtags(likely.toStringPiece(), nullptr, &script, nullptr, nullptr, nullptr, errorCode);
    381        if (U_FAILURE(errorCode) || script.isEmpty()) {
    382            return false;
    383        }
    384    }
    385    UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data());
    386    return uscript_isRightToLeft(scriptCode);
    387 }
    388 
    389 U_NAMESPACE_BEGIN
    390 
    391 UBool
    392 Locale::isRightToLeft() const {
    393    return uloc_isRightToLeft(getBaseName());
    394 }
    395 
    396 U_NAMESPACE_END
    397 
    398 namespace {
    399 icu::CharString
    400 GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) {
    401    icu::CharString result;
    402    // First check for keyword value
    403    icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status);
    404    int32_t len = kw.length();
    405    // In UTS35
    406    //   type = alphanum{3,8} (sep alphanum{3,8})* ;
    407    // so we know the subdivision must fit the type already.
    408    //
    409    //   unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ;
    410    //   unicode_region_subtag = (alpha{2} | digit{3}) ;
    411    //   unicode_subdivision_suffix = alphanum{1,4} ;
    412    // But we also know there are no id in start with digit{3} in
    413    // https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml
    414    // Therefore we can simplify as
    415    // unicode_subdivision_id = alpha{2} alphanum{1,4}
    416    //
    417    // and only need to accept/reject the code based on the alpha{2} and the length.
    418    if (U_SUCCESS(status) && len >= 3 && len <= 6 &&
    419        uprv_isASCIILetter(kw[0]) && uprv_isASCIILetter(kw[1])) {
    420        // Additional Check
    421        static icu::RegionValidateMap valid;
    422        const char region[] = {kw[0], kw[1], '\0'};
    423        if (valid.isSet(region)) {
    424            result.append(uprv_toupper(kw[0]), status);
    425            result.append(uprv_toupper(kw[1]), status);
    426        }
    427    }
    428    return result;
    429 }
    430 }  // namespace
    431 
    432 U_EXPORT icu::CharString
    433 ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
    434                                     UErrorCode& status) {
    435    if (U_FAILURE(status)) {
    436        return {};
    437    }
    438    icu::CharString rgBuf = GetRegionFromKey(localeID, "rg", status);
    439    if (U_SUCCESS(status) && rgBuf.isEmpty()) {
    440        // No valid rg keyword value, try for unicode_region_subtag
    441        rgBuf = ulocimp_getRegion(localeID == nullptr ? uloc_getDefault() : localeID, status);
    442        if (U_SUCCESS(status) && rgBuf.isEmpty() && inferRegion) {
    443            // Second check for sd keyword value
    444            rgBuf = GetRegionFromKey(localeID, "sd", status);
    445            if (U_SUCCESS(status) && rgBuf.isEmpty()) {
    446                // no unicode_region_subtag but inferRegion true, try likely subtags
    447                UErrorCode rgStatus = U_ZERO_ERROR;
    448                icu::CharString locBuf = ulocimp_addLikelySubtags(localeID, rgStatus);
    449                if (U_SUCCESS(rgStatus)) {
    450                    rgBuf = ulocimp_getRegion(locBuf.toStringPiece(), status);
    451                }
    452            }
    453        }
    454    }
    455 
    456    return rgBuf;
    457 }
    458 
    459 namespace {
    460 
    461 // The following data is generated by unit test code inside
    462 // test/intltest/regiontst.cpp from the resource data while
    463 // the test failed.
    464 const uint32_t gValidRegionMap[] = {
    465    0xeedf597c, 0xdeddbdef, 0x15943f3f, 0x0e00d580, 
    466    0xb0095c00, 0x0015fb9f, 0x781c068d, 0x0340400f, 
    467    0xf42b1d00, 0xfd4f8141, 0x25d7fffc, 0x0100084b, 
    468    0x538f3c40, 0x40000001, 0xfdf15100, 0x9fbb7ae7, 
    469    0x0410419a, 0x00408557, 0x00004002, 0x00100001, 
    470    0x00400408, 0x00000001, 
    471 };
    472 
    473 }  // namespace
    474   //
    475 U_NAMESPACE_BEGIN
    476 RegionValidateMap::RegionValidateMap() {
    477    uprv_memcpy(map, gValidRegionMap, sizeof(map));
    478 }
    479 
    480 RegionValidateMap::~RegionValidateMap() {
    481 }
    482 
    483 bool RegionValidateMap::isSet(const char* region) const {
    484    int32_t index = value(region);
    485    if (index < 0) {
    486        return false;
    487    }
    488    return 0 != (map[index / 32] & (1L << (index % 32)));
    489 }
    490 
    491 bool RegionValidateMap::equals(const RegionValidateMap& that) const {
    492    return uprv_memcmp(map, that.map, sizeof(map)) == 0;
    493 }
    494 
    495 // The code transform two letter a-z to a integer valued between -1, 26x26.
    496 // -1 indicate the region is outside the range of two letter a-z
    497 // the rest of value is between 0 and 676 (= 26x26) and used as an index
    498 // the bigmap in map. The map is an array of 22 int32_t.
    499 // since 32x21 < 676/32 < 32x22 we store this 676 bits bitmap into 22 int32_t.
    500 int32_t RegionValidateMap::value(const char* region) const {
    501    if (uprv_isASCIILetter(region[0]) && uprv_isASCIILetter(region[1]) &&
    502        region[2] == '\0') {
    503        return (uprv_toupper(region[0])-'A') * 26 +
    504               (uprv_toupper(region[1])-'A');
    505    }
    506    return -1;
    507 }
    508 
    509 U_NAMESPACE_END