[ tor-browser ].git.dasho

uloc.cpp (90790B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 1997-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *
      9 * File ULOC.CPP
     10 *
     11 * Modification History:
     12 *
     13 *   Date        Name        Description
     14 *   04/01/97    aliu        Creation.
     15 *   08/21/98    stephen     JDK 1.2 sync
     16 *   12/08/98    rtg         New Locale implementation and C API
     17 *   03/15/99    damiba      overhaul.
     18 *   04/06/99    stephen     changed setDefault() to realloc and copy
     19 *   06/14/99    stephen     Changed calls to ures_open for new params
     20 *   07/21/99    stephen     Modified setDefault() to propagate to C++
     21 *   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
     22 *                           brought canonicalization code into line with spec
     23 *****************************************************************************/
     24 
     25 /*
     26   POSIX's locale format, from putil.c: [no spaces]
     27 
     28     ll [ _CC ] [ . MM ] [ @ VV]
     29 
     30     l = lang, C = ctry, M = charmap, V = variant
     31 */
     32 
     33 #include <algorithm>
     34 #include <optional>
     35 #include <string_view>
     36 
     37 #include "unicode/bytestream.h"
     38 #include "unicode/errorcode.h"
     39 #include "unicode/stringpiece.h"
     40 #include "unicode/utypes.h"
     41 #include "unicode/ustring.h"
     42 #include "unicode/uloc.h"
     43 
     44 #include "bytesinkutil.h"
     45 #include "putilimp.h"
     46 #include "ustr_imp.h"
     47 #include "ulocimp.h"
     48 #include "umutex.h"
     49 #include "cstring.h"
     50 #include "cmemory.h"
     51 #include "locmap.h"
     52 #include "uarrsort.h"
     53 #include "uenumimp.h"
     54 #include "uassert.h"
     55 #include "charstr.h"
     56 
     57 U_NAMESPACE_USE
     58 
     59 /* ### Declarations **************************************************/
     60 
     61 /* Locale stuff from locid.cpp */
     62 U_CFUNC void locale_set_default(const char *id);
     63 U_CFUNC const char *locale_get_default();
     64 
     65 namespace {
     66 
     67 /* ### Data tables **************************************************/
     68 
     69 /**
     70 * Table of language codes, both 2- and 3-letter, with preference
     71 * given to 2-letter codes where possible.  Includes 3-letter codes
     72 * that lack a 2-letter equivalent.
     73 *
     74 * This list must be in sorted order.  This list is returned directly
     75 * to the user by some API.
     76 *
     77 * This list must be kept in sync with LANGUAGES_3, with corresponding
     78 * entries matched.
     79 *
     80 * This table should be terminated with a nullptr entry, followed by a
     81 * second list, and another nullptr entry.  The first list is visible to
     82 * user code when this array is returned by API.  The second list
     83 * contains codes we support, but do not expose through user API.
     84 *
     85 * Notes
     86 *
     87 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
     88 * include the revisions up to 2001/7/27 *CWB*
     89 *
     90 * The 3 character codes are the terminology codes like RFC 3066.  This
     91 * is compatible with prior ICU codes
     92 *
* "in", "iw", "ji", "jw" and "sh" have been withdrawn.  They are still in
* the table, but have been moved to the end of it because their 3-character
* codes are duplicates.  This avoids bad searches going from 3- to
* 2-character codes.
     97 *
     98 * The range qaa-qtz is reserved for local use
     99 */
    100 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
    101 /* ISO639 table version is 20150505 */
    102 /* Subsequent hand addition of selected languages */
    103 constexpr const char* LANGUAGES[] = {
    104    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    105    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    106    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    107    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as",
    108    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    109    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    110    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    111    "bgc", "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    112    "blo", "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    113    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    114    "ca",  "cad", "car", "cay", "cch", "ccp", "ce",  "ceb", "cgg",
    115    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    116    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    117    "cs",  "csb", "csw", "cu",  "cv",  "cy",
    118    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    119    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    120    "dyo", "dyu", "dz",  "dzg",
    121    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    122    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    123    "ext",
    124    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    125    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    126    "frs", "fur", "fy",
    127    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    128    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    129    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    130    "gur", "guz", "gv",  "gwi",
    131    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    132    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    133    "hup", "hy",  "hz",
    134    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    135    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    136    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    137    "jv",
    138    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    139    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    140    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    141    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    142    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    143    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    144    "kv",  "kw",  "kxv", "ky",
    145    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    146    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    147    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    148    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    149    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    150    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    151    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    152    "ml",  "mn",  "mnc", "mni",
    153    "moh", "mos", "mr",  "mrj",
    154    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    155    "my",  "mye", "myv", "mzn",
    156    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    157    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    158    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    159    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    160    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    161    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    162    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    163    "pon", "prg", "pro", "ps",  "pt",
    164    "qu",  "quc", "qug",
    165    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    166    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    167    "rw",  "rwk",
    168    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    169    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    170    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    171    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    172    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    173    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    174    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    175    "sv",  "sw",  "swb", "syc", "syr", "szl",
    176    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    177    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr",
    178    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tok", "tpi",
    179    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    180    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    181    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    182    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vmw",
    183    "vo", "vot", "vro", "vun",
    184    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    185    "xal", "xh",  "xmf", "xnr", "xog",
    186    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    187    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    188    "zun", "zxx", "zza",
    189 nullptr,
    190    "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  /* obsolete language codes */
    191 nullptr
    192 };
    193 
    194 constexpr const char* DEPRECATED_LANGUAGES[]={
    195    "in", "iw", "ji", "jw", "mo", nullptr, nullptr
    196 };
    197 constexpr const char* REPLACEMENT_LANGUAGES[]={
    198    "id", "he", "yi", "jv", "ro", nullptr, nullptr
    199 };
    200 
    201 /**
    202 * Table of 3-letter language codes.
    203 *
    204 * This is a lookup table used to convert 3-letter language codes to
    205 * their 2-letter equivalent, where possible.  It must be kept in sync
    206 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
    207 * same language as LANGUAGES_3[i].  The commented-out lines are
    208 * copied from LANGUAGES to make eyeballing this baby easier.
    209 *
    210 * Where a 3-letter language code has no 2-letter equivalent, the
    211 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
    212 *
    213 * This table should be terminated with a nullptr entry, followed by a
    214 * second list, and another nullptr entry.  The two lists correspond to
    215 * the two lists in LANGUAGES.
    216 */
    217 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
    218 /* ISO639 table version is 20150505 */
    219 /* Subsequent hand addition of selected languages */
    220 constexpr const char* LANGUAGES_3[] = {
    221    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    222    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    223    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    224    "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm",
    225    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    226    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    227    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    228    "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    229    "blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    230    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    231    "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg",
    232    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    233    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    234    "ces", "csb", "csw", "chu", "chv", "cym",
    235    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    236    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    237    "dyo", "dyu", "dzo", "dzg",
    238    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    239    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    240    "ext",
    241    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    242    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    243    "frs", "fur", "fry",
    244    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    245    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    246    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    247    "gur", "guz", "glv", "gwi",
    248    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    249    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    250    "hup", "hye", "her",
    251    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    252    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    253    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    254    "jav",
    255    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    256    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    257    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    258    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    259    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    260    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    261    "kom", "cor", "kxv", "kir",
    262    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    263    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    264    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    265    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    266    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    267    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    268    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    269    "mal", "mon", "mnc", "mni",
    270    "moh", "mos", "mar", "mrj",
    271    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    272    "mya", "mye", "myv", "mzn",
    273    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    274    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    275    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    276    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    277    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    278    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc",
    279    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    280    "pon", "prg", "pro", "pus", "por",
    281    "que", "quc", "qug",
    282    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    283    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    284    "kin", "rwk",
    285    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    286    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    287    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    288    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    289    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    290    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    291    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    292    "swe", "swa", "swb", "syc", "syr", "szl",
    293    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    294    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr",
    295    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi",
    296    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    297    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    298    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    299    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw",
    300    "vol", "vot", "vro", "vun",
    301    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    302    "xal", "xho", "xmf", "xnr", "xog",
    303    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    304    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    305    "zun", "zxx", "zza",
    306 nullptr,
    307 /*  "in",  "iw",  "ji",  "jw",  "mo",  "sh",  "swc", "tl",  */
    308    "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl",
    309 nullptr
    310 };
    311 
    312 /**
    313 * Table of 2-letter country codes.
    314 *
    315 * This list must be in sorted order.  This list is returned directly
    316 * to the user by some API.
    317 *
    318 * This list must be kept in sync with COUNTRIES_3, with corresponding
    319 * entries matched.
    320 *
    321 * This table should be terminated with a nullptr entry, followed by a
    322 * second list, and another nullptr entry.  The first list is visible to
    323 * user code when this array is returned by API.  The second list
    324 * contains codes we support, but do not expose through user API.
    325 *
    326 * Notes:
    327 *
* ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE), as per
* http://www.evertype.com/standards/iso3166/iso3166-1-en.html
* New codes were added, keeping the old ones for compatibility.
* Updated to include the 1999/12/03 revisions. *CWB*
    332 *
    333 * RO(ROM) is now RO(ROU) according to
    334 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
    335 */
    336 constexpr const char* COUNTRIES[] = {
    337    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    338    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    339    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    340    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    341    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    342    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",
    343    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",
    344    "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",
    345    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    346    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    347    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    348    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    349    "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    350    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    351    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    352    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    353    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    354    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    355    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    356    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    357    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    358    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    359    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    360    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    361    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    362    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    363    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    364    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    365    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    366    "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
    367 nullptr,
    368    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
    369 nullptr
    370 };
    371 
    372 constexpr const char* DEPRECATED_COUNTRIES[] = {
    373    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", nullptr, nullptr /* deprecated country list */
    374 };
    375 constexpr const char* REPLACEMENT_COUNTRIES[] = {
    376 /*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    377    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", nullptr, nullptr  /* replacement country codes */
    378 };
    379 
    380 /**
    381 * Table of 3-letter country codes.
    382 *
    383 * This is a lookup table used to convert 3-letter country codes to
    384 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
    385 * For all valid i, COUNTRIES[i] must refer to the same country as
    386 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
    387 * to make eyeballing this baby easier.
    388 *
    389 * This table should be terminated with a nullptr entry, followed by a
    390 * second list, and another nullptr entry.  The two lists correspond to
    391 * the two lists in COUNTRIES.
    392 */
    393 constexpr const char* COUNTRIES_3[] = {
    394 /*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    395    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
    396 /*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    397    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
    398 /*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    399    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
    400 /*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    401    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
    402 /*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    403    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
    404 /*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CQ",  "CR",     */
    405    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI",
    406 /*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DG",  "DJ",  "DK",     */
    407    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK",
    408 /*  "DM",  "DO",  "DZ",  "EA",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    409    "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI",
    410 /*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    411    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
    412 /*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    413    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
    414 /*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    415    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
    416 /*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    417    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
    418 /*  "IC",  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    419    "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
    420 /*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    421    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
    422 /*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    423    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
    424 /*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    425    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
    426 /*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    427    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
    428 /*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    429    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
    430 /*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    431    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
    432 /*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    433    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
    434 /*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    435    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
    436 /*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    437    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
    438 /*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    439    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
    440 /*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    441    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
    442 /*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    443    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
    444 /*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    445    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
    446 /*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    447    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
    448 /*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    449    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
    450 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    451    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
    452 /*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    453    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
    454 nullptr,
    455 /*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    456    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
    457 nullptr
    458 };
    459 
    460 typedef struct CanonicalizationMap {
    461    const char *id;          /* input ID */
    462    const char *canonicalID; /* canonicalized output ID */
    463 } CanonicalizationMap;
    464 
    465 /**
    466 * A map to canonicalize locale IDs.  This handles a variety of
    467 * different semantic kinds of transformations.
    468 */
    469 constexpr CanonicalizationMap CANONICALIZE_MAP[] = {
    470    { "art__LOJBAN",    "jbo" }, /* registered name */
    471    { "hy__AREVELA",    "hy" }, /* Registered IANA variant */
    472    { "hy__AREVMDA",    "hyw" }, /* Registered IANA variant */
    473    { "zh__GUOYU",      "zh" }, /* registered name */
    474    { "zh__HAKKA",      "hak" }, /* registered name */
    475    { "zh__XIANG",      "hsn" }, /* registered name */
    476    // subtags with 3 chars won't be treated as variants.
    477    { "zh_GAN",         "gan" }, /* registered name */
    478    { "zh_MIN_NAN",     "nan" }, /* registered name */
    479    { "zh_WUU",         "wuu" }, /* registered name */
    480    { "zh_YUE",         "yue" }, /* registered name */
    481 };
    482 
    483 /* ### BCP47 Conversion *******************************************/
    484 /* Gets the size of the shortest subtag in the given localeID. */
    485 int32_t getShortestSubtagLength(std::string_view localeID) {
    486    int32_t localeIDLength = static_cast<int32_t>(localeID.length());
    487    int32_t length = localeIDLength;
    488    int32_t tmpLength = 0;
    489    int32_t i;
    490    bool reset = true;
    491 
    492    for (i = 0; i < localeIDLength; i++) {
    493        if (localeID[i] != '_' && localeID[i] != '-') {
    494            if (reset) {
    495                tmpLength = 0;
    496                reset = false;
    497            }
    498            tmpLength++;
    499        } else {
    500            if (tmpLength != 0 && tmpLength < length) {
    501                length = tmpLength;
    502            }
    503            reset = true;
    504        }
    505    }
    506 
    507    return length;
    508 }
    509 /* Test if the locale id has BCP47 u extension and does not have '@' */
    510 inline bool _hasBCP47Extension(std::string_view id) {
    511    return id.find('@') == std::string_view::npos && getShortestSubtagLength(id) == 1;
    512 }
    513 
    514 /* ### Keywords **************************************************/
    515 inline bool UPRV_ISDIGIT(char c) { return c >= '0' && c <= '9'; }
    516 inline bool UPRV_ISALPHANUM(char c) { return uprv_isASCIILetter(c) || UPRV_ISDIGIT(c); }
    517 /* Punctuation/symbols allowed in legacy key values */
    518 inline bool UPRV_OK_VALUE_PUNCTUATION(char c) { return c == '_' || c == '-' || c == '+' || c == '/'; }
    519 
    520 }  // namespace
    521 
/* Size, in chars, of the KeywordStruct::keyword buffer.  ulocimp_getKeywords()
   fails with U_INTERNAL_PROGRAM_ERROR when the text before '=' is this long
   or longer. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Number of KeywordStruct slots ulocimp_getKeywords() keeps on its stack; a
   further keyword beyond this many fails with U_INTERNAL_PROGRAM_ERROR. */
#define ULOC_MAX_NO_KEYWORDS 25
    524 
    525 U_CAPI const char * U_EXPORT2
    526 locale_getKeywordsStart(std::string_view localeID) {
    527    if (size_t pos = localeID.find('@'); pos != std::string_view::npos) {
    528        return localeID.data() + pos;
    529    }
    530 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    531    else {
    532        /* We do this because the @ sign is variant, and the @ sign used on one
    533        EBCDIC machine won't be compiled the same way on other EBCDIC based
    534        machines. */
    535        static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
    536        const uint8_t *charToFind = ebcdicSigns;
    537        while(*charToFind) {
    538            if (size_t pos = localeID.find(*charToFind); pos != std::string_view::npos) {
    539                return localeID.data() + pos;
    540            }
    541            charToFind++;
    542        }
    543    }
    544 #endif
    545    return nullptr;
    546 }
    547 
    548 namespace {
    549 
    550 /**
    551 * @param keywordName incoming name to be canonicalized
    552 * @param status return status (keyword too long)
    553 * @return the keyword name
    554 */
    555 CharString locale_canonKeywordName(std::string_view keywordName, UErrorCode& status)
    556 {
    557  if (U_FAILURE(status)) { return {}; }
    558  CharString result;
    559 
    560  for (char c : keywordName) {
    561    if (!UPRV_ISALPHANUM(c)) {
    562      status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
    563      return {};
    564    }
    565    result.append(uprv_tolower(c), status);
    566  }
    567  if (result.isEmpty()) {
    568    status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */
    569    return {};
    570  }
    571 
    572  return result;
    573 }
    574 
    575 typedef struct {
    576    char keyword[ULOC_KEYWORD_BUFFER_LEN];
    577    int32_t keywordLen;
    578    const char *valueStart;
    579    int32_t valueLen;
    580 } KeywordStruct;
    581 
    582 int32_t U_CALLCONV
    583 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
    584    const char* leftString = static_cast<const KeywordStruct*>(left)->keyword;
    585    const char* rightString = static_cast<const KeywordStruct*>(right)->keyword;
    586    return uprv_strcmp(leftString, rightString);
    587 }
    588 
    589 }  // namespace
    590 
    591 U_EXPORT CharString
    592 ulocimp_getKeywords(std::string_view localeID,
    593                    char prev,
    594                    bool valuesToo,
    595                    UErrorCode& status)
    596 {
    597    return ByteSinkUtil::viaByteSinkToCharString(
    598        [&](ByteSink& sink, UErrorCode& status) {
    599            ulocimp_getKeywords(localeID,
    600                                prev,
    601                                sink,
    602                                valuesToo,
    603                                status);
    604        },
    605        status);
    606 }
    607 
    /**
     * Parses a locale ID keyword list and writes it to `sink`, sorted by keyword
     * name.
     *
     * Nothing is parsed or written unless `prev` is '@'. In that case `localeID`
     * is read as "key=value" pairs separated by ';'. Keyword names are lowercased
     * and have their spaces removed; values are emitted as written, minus the
     * spaces around them. When a keyword name repeats, only its first occurrence
     * is kept. Every parsing error returns before anything is appended to `sink`.
     *
     * @param localeID  Text to parse as a keyword list.
     * @param prev      Must be '@' for any work to happen; presumably the character
     *                  that preceded `localeID` in the full ID (confirm with callers).
     * @param sink      Receives the output.
     * @param valuesToo If true the output is "key=value" items joined by ';' (no
     *                  trailing ';'); otherwise each keyword name is written
     *                  followed by one NUL byte.
     * @param status    Set to U_INVALID_FORMAT_ERROR for a pair without '=', a ';'
     *                  that comes before the '=', an empty keyword name or an
     *                  empty value. Set to U_INTERNAL_PROGRAM_ERROR when another
     *                  pair starts after ULOC_MAX_NO_KEYWORDS keywords are stored
     *                  or a keyword name does not fit ULOC_KEYWORD_BUFFER_LEN.
     */
    608 U_EXPORT void
    609 ulocimp_getKeywords(std::string_view localeID,
    610                    char prev,
    611                    ByteSink& sink,
    612                    bool valuesToo,
    613                    UErrorCode& status)
    614 {
    615    if (U_FAILURE(status)) { return; }
    616 
    617    KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
    618 
    619    int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
    620    int32_t numKeywords = 0;
    621    size_t equalSign = std::string_view::npos;
    622    size_t semicolon = std::string_view::npos;
    623    int32_t i = 0, j, n;
    624 
    625    if(prev == '@') { /* start of keyword definition */
    626        /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
    627        do {
    628            bool duplicate = false;
    629            /* skip leading spaces */
    630            while (!localeID.empty() && localeID.front() == ' ') {
    631                localeID.remove_prefix(1);
    632            }
    633            if (localeID.empty()) { /* handle trailing "; " */
    634                break;
    635            }
                   /* The list is already full: fail before even looking at the next pair. */
    636            if(numKeywords == maxKeywords) {
    637                status = U_INTERNAL_PROGRAM_ERROR;
    638                return;
    639            }
    640            equalSign = localeID.find('=');
    641            semicolon = localeID.find(';');
    642            /* lack of '=' [foo@currency] is illegal */
    643            /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
    644            if (equalSign == std::string_view::npos ||
    645                (semicolon != std::string_view::npos && semicolon < equalSign)) {
    646                status = U_INVALID_FORMAT_ERROR;
    647                return;
    648            }
    649            /* zero-length keyword is an error. */
    650            if (equalSign == 0) {
    651                status = U_INVALID_FORMAT_ERROR;
    652                return;
    653            }
    654            /* need to normalize both keyword and keyword name */
    655            if (equalSign >= ULOC_KEYWORD_BUFFER_LEN) {
    656                /* keyword name too long for internal buffer */
    657                status = U_INTERNAL_PROGRAM_ERROR;
    658                return;
    659            }
                   /* Copy the name into the current slot: drop spaces, lowercase the rest. */
    660            for (i = 0, n = 0; static_cast<size_t>(i) < equalSign; ++i) {
    661                if (localeID[i] != ' ') {
    662                    keywordList[numKeywords].keyword[n++] = uprv_tolower(localeID[i]);
    663                }
    664            }
    665 
    666            keywordList[numKeywords].keyword[n] = 0;
    667            keywordList[numKeywords].keywordLen = n;
    668            /* now grab the value part. First we skip the '=' */
    669            equalSign++;
    670            /* then we skip leading spaces */
    671            while (equalSign < localeID.length() && localeID[equalSign] == ' ') {
    672                equalSign++;
    673            }
    674 
    675            /* Premature end or zero-length value */
    676            if (equalSign == localeID.length() || equalSign == semicolon) {
    677                status = U_INVALID_FORMAT_ERROR;
    678                return;
    679            }
    680 
                   /* Only a pointer into the caller's text is stored; the value is not copied. */
    681            keywordList[numKeywords].valueStart = localeID.data() + equalSign;
    682 
                   /* Narrow `value` to this pair's value text, and advance localeID past
                    * the ';' (or empty it when this was the last pair). */
    683            std::string_view value = localeID;
    684            if (semicolon != std::string_view::npos) {
    685                value.remove_suffix(value.length() - semicolon);
    686                localeID.remove_prefix(semicolon + 1);
    687            } else {
    688                localeID = {};
    689            }
    690            value.remove_prefix(equalSign);
                   /* Trim trailing spaces so that valueLen stops at the last non-space character. */
    691            if (size_t last = value.find_last_not_of(' '); last != std::string_view::npos) {
    692                value.remove_suffix(value.length() - last - 1);
    693            }
    694            keywordList[numKeywords].valueLen = static_cast<int32_t>(value.length());
    695 
    696            /* If this is a duplicate keyword, then ignore it */
    697            for (j=0; j<numKeywords; ++j) {
    698                if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
    699                    duplicate = true;
    700                    break;
    701                }
    702            }
                   /* A duplicate leaves numKeywords unchanged, so the next pair reuses this slot. */
    703            if (!duplicate) {
    704                ++numKeywords;
    705            }
    706        } while (!localeID.empty());
    707 
    708        /* now we have a list of keywords */
    709        /* we need to sort it */
    710        uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, &status);
    711 
    712        /* Now construct the keyword part */
    713        for(i = 0; i < numKeywords; i++) {
    714            sink.Append(keywordList[i].keyword, keywordList[i].keywordLen);
    715            if(valuesToo) {
    716                sink.Append("=", 1);
    717                sink.Append(keywordList[i].valueStart, keywordList[i].valueLen);
    718                if(i < numKeywords - 1) {
    719                    sink.Append(";", 1);
    720                }
    721            } else {
                       /* names only: each name is followed by a NUL byte */
    722                sink.Append("\0", 1);
    723            }
    724        }
    725    }
    726 }
    727 
    728 U_CAPI int32_t U_EXPORT2
    729 uloc_getKeywordValue(const char* localeID,
    730                     const char* keywordName,
    731                     char* buffer, int32_t bufferCapacity,
    732                     UErrorCode* status)
    733 {
    734    if (U_FAILURE(*status)) { return 0; }
    735    if (keywordName == nullptr || *keywordName == '\0') {
    736        *status = U_ILLEGAL_ARGUMENT_ERROR;
    737        return 0;
    738    }
    739    return ByteSinkUtil::viaByteSinkToTerminatedChars(
    740        buffer, bufferCapacity,
    741        [&](ByteSink& sink, UErrorCode& status) {
    742            ulocimp_getKeywordValue(localeID, keywordName, sink, status);
    743        },
    744        *status);
    745 }
    746 
    747 U_EXPORT CharString
    748 ulocimp_getKeywordValue(const char* localeID,
    749                        std::string_view keywordName,
    750                        UErrorCode& status)
    751 {
    752    return ByteSinkUtil::viaByteSinkToCharString(
    753        [&](ByteSink& sink, UErrorCode& status) {
    754            ulocimp_getKeywordValue(localeID, keywordName, sink, status);
    755        },
    756        status);
    757 }
    758 
    /**
     * Finds `keywordName` in the keyword list of `localeID` and appends its value
     * to `sink`.
     *
     * The requested name is canonicalized with locale_canonKeywordName() and
     * compared against the lowercased keyword names found in the locale. If
     * _hasBCP47Extension() accepts `localeID`, it is first converted with
     * ulocimp_forLanguageTag(); the original string is used instead when that
     * conversion fails or yields an empty result. Nothing is appended when the
     * locale has no keyword section or the keyword is not present.
     *
     * @param localeID    NUL-terminated locale ID; must not be null.
     * @param keywordName Keyword to look up; must not be empty.
     * @param sink        Receives the value with surrounding spaces removed; its
     *                    case is left as written.
     * @param status      Set to U_ILLEGAL_ARGUMENT_ERROR for a null `localeID`, an
     *                    empty `keywordName`, a keyword entry without '=', an
     *                    empty or non-alphanumeric keyword name in the locale, or
     *                    an empty or malformed value for the matching keyword.
     */
    759 U_EXPORT void
    760 ulocimp_getKeywordValue(const char* localeID,
    761                        std::string_view keywordName,
    762                        icu::ByteSink& sink,
    763                        UErrorCode& status)
    764 {
    765    if (U_FAILURE(status)) { return; }
    766 
    767    if (localeID == nullptr || keywordName.empty()) {
    768        status = U_ILLEGAL_ARGUMENT_ERROR;
    769        return;
    770    }
    771 
    772    const char* startSearchHere = nullptr;
    773    const char* nextSeparator = nullptr;
    774 
    775    CharString tempBuffer;
    776    const char* tmpLocaleID;
    777 
    778    CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
    779    if (U_FAILURE(status)) {
    780      return;
    781    }
    782 
           /* A null localeID was already rejected above, so the null test here is redundant. */
    783    if (localeID != nullptr && _hasBCP47Extension(localeID)) {
    784        tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, status);
    785        tmpLocaleID = U_SUCCESS(status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
    786    } else {
    787        tmpLocaleID=localeID;
    788    }
    789 
    790    startSearchHere = locale_getKeywordsStart(tmpLocaleID);
    791    if(startSearchHere == nullptr) {
    792        /* no keywords, return at once */
    793        return;
    794    }
    795 
    796    /* find the first keyword */
    797    while(startSearchHere) {
    798        const char* keyValueTail;
    799 
    800        startSearchHere++; /* skip @ or ; */
    801        nextSeparator = uprv_strchr(startSearchHere, '=');
    802        if(!nextSeparator) {
    803            status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
    804            return;
    805        }
    806        /* strip leading & trailing spaces (TC decided to tolerate these) */
    807        while(*startSearchHere == ' ') {
    808            startSearchHere++;
    809        }
    810        keyValueTail = nextSeparator;
    811        while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') {
    812            keyValueTail--;
    813        }
    814        /* now keyValueTail points to first char after the keyName */
    815        /* copy & normalize keyName from locale */
    816        if (startSearchHere == keyValueTail) {
    817            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
    818            return;
    819        }
    820        CharString localeKeywordName;
    821        while (startSearchHere < keyValueTail) {
    822          if (!UPRV_ISALPHANUM(*startSearchHere)) {
    823            status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
    824            return;
    825          }
    826          localeKeywordName.append(uprv_tolower(*startSearchHere++), status);
    827        }
    828        if (U_FAILURE(status)) {
    829            return;
    830        }
    831 
               /* From here on startSearchHere is the ';' that ends this entry, or
                * nullptr when this is the last entry (which also ends the loop). */
    832        startSearchHere = uprv_strchr(nextSeparator, ';');
    833 
    834        if (canonKeywordName == localeKeywordName) {
    835             /* current entry matches the keyword. */
    836           nextSeparator++; /* skip '=' */
    837            /* First strip leading & trailing spaces (TC decided to tolerate these) */
    838            while(*nextSeparator == ' ') {
    839              nextSeparator++;
    840            }
    841            keyValueTail = (startSearchHere)? startSearchHere: nextSeparator + uprv_strlen(nextSeparator);
    842            while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') {
    843              keyValueTail--;
    844            }
    845            /* Now copy the value, but check well-formedness */
    846            if (nextSeparator == keyValueTail) {
    847              status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */
    848              return;
    849            }
                   /* NOTE: the value is appended one character at a time, so a malformed
                    * character can be reached after part of the value is already in the sink. */
    850            while (nextSeparator < keyValueTail) {
    851              if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) {
    852                status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
    853                return;
    854              }
    855              /* Should we lowercase value to return here? Tests expect as-is. */
    856              sink.Append(nextSeparator++, 1);
    857            }
    858            return;
    859        }
    860    }
    861 }
    862 
    863 U_CAPI int32_t U_EXPORT2
    864 uloc_setKeywordValue(const char* keywordName,
    865                     const char* keywordValue,
    866                     char* buffer, int32_t bufferCapacity,
    867                     UErrorCode* status)
    868 {
    869    if (U_FAILURE(*status)) { return 0; }
    870 
    871    if (keywordName == nullptr || *keywordName == 0) {
    872        *status = U_ILLEGAL_ARGUMENT_ERROR;
    873        return 0;
    874    }
    875 
    876    if (bufferCapacity <= 1) {
    877        *status = U_ILLEGAL_ARGUMENT_ERROR;
    878        return 0;
    879    }
    880 
    881    int32_t bufLen = (int32_t)uprv_strlen(buffer);
    882    if(bufferCapacity<bufLen) {
    883        /* The capacity is less than the length?! Is this NUL terminated? */
    884        *status = U_ILLEGAL_ARGUMENT_ERROR;
    885        return 0;
    886    }
    887 
    888    char* keywords = const_cast<char*>(
    889        locale_getKeywordsStart({buffer, static_cast<std::string_view::size_type>(bufLen)}));
    890    int32_t baseLen = keywords == nullptr ? bufLen : keywords - buffer;
    891    // Remove -1 from the capacity so that this function can guarantee NUL termination.
    892    CheckedArrayByteSink sink(keywords == nullptr ? buffer + bufLen : keywords,
    893                              bufferCapacity - baseLen - 1);
    894    int32_t reslen = ulocimp_setKeywordValue(
    895        keywords == nullptr ? std::string_view() : keywords,
    896        keywordName,
    897        keywordValue == nullptr ? std::string_view() : keywordValue,
    898        sink,
    899        *status);
    900 
    901    if (U_FAILURE(*status)) {
    902        return *status == U_BUFFER_OVERFLOW_ERROR ? reslen + baseLen : 0;
    903    }
    904 
    905    // See the documentation for this function, it's guaranteed to never
    906    // overflow the buffer but instead abort with BUFFER_OVERFLOW_ERROR.
    907    // In this case, nothing has been written to the sink, so it cannot have Overflowed().
    908    U_ASSERT(!sink.Overflowed());
    909    U_ASSERT(reslen >= 0);
    910    return u_terminateChars(buffer, bufferCapacity, reslen + baseLen, status);
    911 }
    912 
    913 U_EXPORT void
    914 ulocimp_setKeywordValue(std::string_view keywordName,
    915                        std::string_view keywordValue,
    916                        CharString& localeID,
    917                        UErrorCode& status)
    918 {
    919    if (U_FAILURE(status)) { return; }
    920    std::string_view keywords;
    921    if (const char* start = locale_getKeywordsStart(localeID.toStringPiece()); start != nullptr) {
    922        // This is safe because CharString::truncate() doesn't actually erase any
    923        // data, but simply sets the position for where new data will be written.
    924        int32_t size = start - localeID.data();
    925        keywords = localeID.toStringPiece();
    926        keywords.remove_prefix(size);
    927        localeID.truncate(size);
    928    }
    929    CharStringByteSink sink(&localeID);
    930    ulocimp_setKeywordValue(keywords, keywordName, keywordValue, sink, status);
    931 }
    932 
    /**
     * Produces an updated keyword section in which `keywordName` is set to
     * `keywordValue`, and writes it to `sink`.
     *
     * An empty `keywordValue` requests removal of the keyword. Existing entries
     * are re-emitted with lowercased names and with the spaces around names and
     * values removed. A new keyword is inserted in front of the first existing
     * entry whose name compares greater, or appended after the last entry. The
     * updated text is assembled in a temporary CharString first and handed to the
     * sink in a single Append(), after checking that the sink can provide a
     * buffer of the full length.
     *
     * @param keywords     Existing keyword section; expected to begin with '@'
     *                     (the first character is skipped without being examined).
     *                     A size of 0 or 1 is treated as "no keywords".
     * @param keywordName  Keyword to set or remove; must not be empty.
     * @param keywordValue New value, or empty to remove the keyword.
     * @param sink         Receives the resulting keyword section.
     * @param status       Set to U_ILLEGAL_ARGUMENT_ERROR for an empty keyword
     *                     name, a malformed value, or a malformed entry in
     *                     `keywords`; to U_BUFFER_OVERFLOW_ERROR when the sink
     *                     cannot provide enough room.
     * @return The number of bytes of the resulting keyword section, also when
     *         U_BUFFER_OVERFLOW_ERROR is set; 0 for the argument errors above and
     *         when there was nothing to remove from an empty keyword section.
     */
    933 U_EXPORT int32_t
    934 ulocimp_setKeywordValue(std::string_view keywords,
    935                        std::string_view keywordName,
    936                        std::string_view keywordValue,
    937                        ByteSink& sink,
    938                        UErrorCode& status)
    939 {
    940    if (U_FAILURE(status)) { return 0; }
    941 
    942    /* TODO: sorting. removal. */
    943    int32_t needLen = 0;
    944    int32_t rc;
    945    CharString updatedKeysAndValues;
    946    bool handledInputKeyAndValue = false;
    947    char keyValuePrefix = '@';
    948 
           /* This warning is not a failure, so it gets past the check above; clear it
            * so that the assertions further down hold. */
    949    if (status == U_STRING_NOT_TERMINATED_WARNING) {
    950        status = U_ZERO_ERROR;
    951    }
    952    if (keywordName.empty()) {
    953        status = U_ILLEGAL_ARGUMENT_ERROR;
    954        return 0;
    955    }
    956    CharString canonKeywordName = locale_canonKeywordName(keywordName, status);
    957    if (U_FAILURE(status)) {
    958        return 0;
    959    }
    960 
           /* Validate the new value character by character; it is stored unchanged. */
    961    CharString canonKeywordValue;
    962    for (char c : keywordValue) {
    963        if (!UPRV_ISALPHANUM(c) && !UPRV_OK_VALUE_PUNCTUATION(c)) {
    964            status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */
    965            return 0;
    966        }
    967        /* Should we force lowercase in value to set? */
    968        canonKeywordValue.append(c, status);
    969    }
    970    if (U_FAILURE(status)) {
    971        return 0;
    972    }
    973 
           /* Shortcut: there are no existing keywords to merge with. */
    974    if (keywords.size() <= 1) {
    975        if (canonKeywordValue.isEmpty()) { /* no keywords = nothing to remove */
    976            U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
    977            return 0;
    978        }
    979 
               /* '@' + name + '=' + value */
    980        needLen = 1 + canonKeywordName.length() + 1 + canonKeywordValue.length();
    981        int32_t capacity = 0;
    982        char* buffer = sink.GetAppendBuffer(
    983                needLen, needLen, nullptr, needLen, &capacity);
    984        if (capacity < needLen || buffer == nullptr) {
    985            status = U_BUFFER_OVERFLOW_ERROR;
    986            return needLen; /* no change */
    987        }
    988        char* it = buffer;
    989 
    990        *it++ = '@';
    991        uprv_memcpy(it, canonKeywordName.data(), canonKeywordName.length());
    992        it += canonKeywordName.length();
    993        *it++ = '=';
    994        uprv_memcpy(it, canonKeywordValue.data(), canonKeywordValue.length());
    995        sink.Append(buffer, needLen);
    996        U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
    997        return needLen;
    998    } /* end shortcut - no @ */
    999 
   1000    /* search for keyword */
   1001    for (size_t keywordStart = 0; keywordStart != std::string_view::npos;) {
   1002        keywordStart++; /* skip @ or ; */
   1003        size_t nextEqualsign = keywords.find('=', keywordStart);
   1004        if (nextEqualsign == std::string_view::npos) {
   1005            status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */
   1006            return 0;
   1007        }
   1008        /* strip leading & trailing spaces (TC decided to tolerate these) */
   1009        while (keywordStart < keywords.size() && keywords[keywordStart] == ' ') {
   1010            keywordStart++;
   1011        }
   1012        size_t keyValueTail = nextEqualsign;
   1013        while (keyValueTail > keywordStart && keywords[keyValueTail - 1] == ' ') {
   1014            keyValueTail--;
   1015        }
   1016        /* now keyValueTail points to first char after the keyName */
   1017        /* copy & normalize keyName from locale */
   1018        if (keywordStart == keyValueTail) {
   1019            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */
   1020            return 0;
   1021        }
   1022        CharString localeKeywordName;
   1023        while (keywordStart < keyValueTail) {
   1024            if (!UPRV_ISALPHANUM(keywords[keywordStart])) {
   1025                status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
   1026                return 0;
   1027            }
   1028            localeKeywordName.append(uprv_tolower(keywords[keywordStart++]), status);
   1029        }
   1030        if (U_FAILURE(status)) {
   1031            return 0;
   1032        }
   1033 
               /* Position of the ';' that ends this entry, or npos for the last entry. */
   1034        size_t nextSeparator = keywords.find(';', nextEqualsign);
   1035 
   1036        /* start processing the value part */
   1037        nextEqualsign++; /* skip '=' */
   1038        /* First strip leading & trailing spaces (TC decided to tolerate these) */
   1039        while (nextEqualsign < keywords.size() && keywords[nextEqualsign] == ' ') {
   1040            nextEqualsign++;
   1041        }
   1042        keyValueTail = nextSeparator == std::string_view::npos ? keywords.size() : nextSeparator;
   1043        while (keyValueTail > nextEqualsign && keywords[keyValueTail - 1] == ' ') {
   1044            keyValueTail--;
   1045        }
   1046        if (nextEqualsign == keyValueTail) {
   1047            status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */
   1048            return 0;
   1049        }
   1050 
               /* rc == 0: same keyword; rc < 0: the input keyword sorts before this entry. */
   1051        rc = uprv_strcmp(canonKeywordName.data(), localeKeywordName.data());
   1052        if(rc == 0) {
   1053            /* Current entry matches the input keyword. Update the entry */
   1054            if (!canonKeywordValue.isEmpty()) { /* updating a value */
   1055                updatedKeysAndValues.append(keyValuePrefix, status);
   1056                keyValuePrefix = ';'; /* for any subsequent key-value pair */
   1057                updatedKeysAndValues.append(canonKeywordName, status);
   1058                updatedKeysAndValues.append('=', status);
   1059                updatedKeysAndValues.append(canonKeywordValue, status);
   1060            } /* else removing this entry, don't emit anything */
   1061            handledInputKeyAndValue = true;
   1062        } else {
   1063           /* input keyword sorts earlier than current entry, add before current entry */
   1064            if (rc < 0 && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) {
   1065                /* insert new entry at this location */
   1066                updatedKeysAndValues.append(keyValuePrefix, status);
   1067                keyValuePrefix = ';'; /* for any subsequent key-value pair */
   1068                updatedKeysAndValues.append(canonKeywordName, status);
   1069                updatedKeysAndValues.append('=', status);
   1070                updatedKeysAndValues.append(canonKeywordValue, status);
   1071                handledInputKeyAndValue = true;
   1072            }
   1073            /* copy the current entry */
   1074            updatedKeysAndValues.append(keyValuePrefix, status);
   1075            keyValuePrefix = ';'; /* for any subsequent key-value pair */
   1076            updatedKeysAndValues.append(localeKeywordName, status);
   1077            updatedKeysAndValues.append('=', status);
   1078            updatedKeysAndValues.append(keywords.data() + nextEqualsign,
   1079                                        static_cast<int32_t>(keyValueTail - nextEqualsign), status);
   1080        }
   1081        if (nextSeparator == std::string_view::npos && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) {
   1082            /* append new entry at the end, it sorts later than existing entries */
   1083            updatedKeysAndValues.append(keyValuePrefix, status);
   1084            /* skip keyValuePrefix update, no subsequent key-value pair */
   1085            updatedKeysAndValues.append(canonKeywordName, status);
   1086            updatedKeysAndValues.append('=', status);
   1087            updatedKeysAndValues.append(canonKeywordValue, status);
   1088            handledInputKeyAndValue = true;
   1089        }
               /* npos here ends the loop; otherwise continue at the ';' of the next entry. */
   1090        keywordStart = nextSeparator;
   1091    } /* end loop searching */
   1092 
   1093    /* Any error from updatedKeysAndValues.append above would be internal and not due to
   1094     * problems with the passed-in locale. So if we did encounter problems with the
   1095     * passed-in locale above, those errors took precedence and overrode any error
   1096     * status from updatedKeysAndValues.append, and also caused a return of 0. If there
   1097     * are errors here they are from updatedKeysAndValues.append; they do cause an
   1098     * error return but the passed-in locale is unmodified and the original bufLen is
   1099     * returned.
   1100     */
   1101    if (!handledInputKeyAndValue || U_FAILURE(status)) {
   1102        /* if input key/value specified removal of a keyword not present in locale, or
   1103         * there was an error in CharString.append, leave original locale alone. */
   1104        U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
   1105        // The sink is expected to be a buffer which already contains the full
   1106        // locale string, so when it isn't going to be modified there's no need
   1107        // to actually write any data to it, as the data is already there. Only
   1108        // the first character needs to be overwritten (changing '\0' to '@').
   1109        needLen = static_cast<int32_t>(keywords.size());
   1110        int32_t capacity = 0;
   1111        char* buffer = sink.GetAppendBuffer(
   1112                needLen, needLen, nullptr, needLen, &capacity);
   1113        if (capacity < needLen || buffer == nullptr) {
   1114            status = U_BUFFER_OVERFLOW_ERROR;
   1115        } else {
   1116            *buffer = '@';
   1117            sink.Append(buffer, needLen);
   1118        }
   1119        return needLen;
   1120    }
   1121 
   1122    needLen = updatedKeysAndValues.length();
   1123    // Check to see can we fit the updatedKeysAndValues, if not, return
   1124    // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it.
   1125    // We do this because this API function does not behave like most others:
   1126    // It promises never to set a U_STRING_NOT_TERMINATED_WARNING.
   1127    // When the contents fits but without the terminating NUL, in this case we need to not change
   1128    // the buffer contents and return with a buffer overflow error.
           // needLen is 0 when the only existing keyword was removed; nothing is appended then.
   1129    if (needLen > 0) {
   1130        int32_t capacity = 0;
   1131        char* buffer = sink.GetAppendBuffer(
   1132                needLen, needLen, nullptr, needLen, &capacity);
   1133        if (capacity < needLen || buffer == nullptr) {
   1134            status = U_BUFFER_OVERFLOW_ERROR;
   1135            return needLen;
   1136        }
   1137        uprv_memcpy(buffer, updatedKeysAndValues.data(), needLen);
   1138        sink.Append(buffer, needLen);
   1139    }
   1140    U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
   1141    return needLen;
   1142 }
   1143 
   1144 /* ### ID parsing implementation **************************************************/
   1145 
   1146 namespace {
   1147 
   /** True for the letters that may start a special ID prefix: 'x' or 'i', in either case. */
   inline bool _isPrefixLetter(char a) {
       switch (a) {
           case 'x':
           case 'X':
           case 'i':
           case 'I':
               return true;
           default:
               return false;
       }
   }
   1149 
   1150 /*returns true if one of the special prefixes is here (s=string)
   1151  'x-' or 'i-' */
   1152 inline bool _isIDPrefix(std::string_view s) {
   1153    return s.size() >= 2 && _isPrefixLetter(s[0]) && _isIDSeparator(s[1]);
   1154 }
   1155 
   /*
    * True for the characters that end an ID piece: '.' and '@'.
    * Dot terminates it because of POSIX form where dot precedes the codepage
    * except for variant.
    */
   inline bool _isTerminator(char a) {
       switch (a) {
           case '.':
           case '@':
               return true;
           default:
               return false;
       }
   }
   1160 
   /**
    * True when p starts with "-t-", "-u-" or "-x-" (the letter in either case),
    * i.e. a '-' delimited singleton t, u or x.
    */
   inline bool _isBCP47Extension(std::string_view p) {
       if (p.size() < 3 || p[0] != '-' || p[2] != '-') {
           return false;
       }
       switch (p[1]) {
           case 't': case 'T':
           case 'u': case 'U':
           case 'x': case 'X':
               return true;
           default:
               return false;
       }
   }
   1169 
   1170 /**
   1171 * Lookup 'key' in the array 'list'.  The array 'list' should contain
   1172 * a nullptr entry, followed by more entries, and a second nullptr entry.
   1173 *
   1174 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
   1175 * COUNTRIES_3.
   1176 */
   1177 std::optional<int16_t> _findIndex(const char* const* list, const char* key)
   1178 {
   1179    const char* const* anchor = list;
   1180    int32_t pass = 0;
   1181 
   1182    /* Make two passes through two nullptr-terminated arrays at 'list' */
   1183    while (pass++ < 2) {
   1184        while (*list) {
   1185            if (uprv_strcmp(key, *list) == 0) {
   1186                return static_cast<int16_t>(list - anchor);
   1187            }
   1188            list++;
   1189        }
   1190        ++list;     /* skip final nullptr *CWB*/
   1191    }
   1192    return std::nullopt;
   1193 }
   1194 
   1195 }  // namespace
   1196 
   1197 U_CFUNC const char*
   1198 uloc_getCurrentCountryID(const char* oldID){
   1199    std::optional<int16_t> offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
   1200    return offset.has_value() ? REPLACEMENT_COUNTRIES[*offset] : oldID;
   1201 }
   1202 U_CFUNC const char*
   1203 uloc_getCurrentLanguageID(const char* oldID){
   1204    std::optional<int16_t> offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
   1205    return offset.has_value() ? REPLACEMENT_LANGUAGES[*offset] : oldID;
   1206 }
   1207 
   1208 namespace {
   1209 
   1210 /*
   1211 * The internal functions _getLanguage(), _getScript(), _getRegion(), _getVariant()
   1212 * avoid duplicating code to handle the earlier locale ID pieces
   1213 * in the functions for the later ones: each one returns a size_t count to its
   1214 * caller instead of setting a *pEnd pointer to where it stopped parsing.
   1215 *
   1216 * TODO try to use this in Locale
   1217 */
   1218 
   1219 size_t _getLanguage(std::string_view localeID, ByteSink* sink, UErrorCode& status) {
   1220    size_t skip = 0;
   1221    if (localeID.size() == 4 && uprv_strnicmp(localeID.data(), "root", 4) == 0) {
   1222        skip = 4;
   1223        localeID.remove_prefix(skip);
   1224    } else if (localeID.size() >= 3 && uprv_strnicmp(localeID.data(), "und", 3) == 0 &&
   1225               (localeID.size() == 3 ||
   1226                localeID[3] == '-' ||
   1227                localeID[3] == '_' ||
   1228                localeID[3] == '@')) {
   1229        skip = 3;
   1230        localeID.remove_prefix(skip);
   1231    }
   1232 
   1233    constexpr int32_t MAXLEN = ULOC_LANG_CAPACITY - 1;  // Minus NUL.
   1234 
   1235    /* if it starts with i- or x- then copy that prefix */
   1236    size_t len = _isIDPrefix(localeID) ? 2 : 0;
   1237    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
   1238        if (len == MAXLEN) {
   1239            status = U_ILLEGAL_ARGUMENT_ERROR;
   1240            return 0;
   1241        }
   1242        len++;
   1243    }
   1244 
   1245    if (sink == nullptr || len == 0) { return skip + len; }
   1246 
   1247    int32_t minCapacity = uprv_max(static_cast<int32_t>(len), 4);  // Minimum 3 letters plus NUL.
   1248    char scratch[MAXLEN];
   1249    int32_t capacity = 0;
   1250    char* buffer = sink->GetAppendBuffer(
   1251            minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity);
   1252 
   1253    for (size_t i = 0; i < len; ++i) {
   1254        buffer[i] = uprv_tolower(localeID[i]);
   1255    }
   1256    if (localeID.size() >= 2 && _isIDSeparator(localeID[1])) {
   1257        buffer[1] = '-';
   1258    }
   1259 
   1260    if (len == 3) {
   1261        /* convert 3 character code to 2 character code if possible *CWB*/
   1262        U_ASSERT(capacity >= 4);
   1263        buffer[3] = '\0';
   1264        std::optional<int16_t> offset = _findIndex(LANGUAGES_3, buffer);
   1265        if (offset.has_value()) {
   1266            const char* const alias = LANGUAGES[*offset];
   1267            sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias)));
   1268            return skip + len;
   1269        }
   1270    }
   1271 
   1272    sink->Append(buffer, static_cast<int32_t>(len));
   1273    return skip + len;
   1274 }
   1275 
   1276 size_t _getScript(std::string_view localeID, ByteSink* sink) {
   1277    constexpr int32_t LENGTH = 4;
   1278 
   1279    size_t len = 0;
   1280    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len]) &&
   1281            uprv_isASCIILetter(localeID[len])) {
   1282        if (len == LENGTH) { return 0; }
   1283        len++;
   1284    }
   1285    if (len != LENGTH) { return 0; }
   1286 
   1287    if (sink == nullptr) { return len; }
   1288 
   1289    char scratch[LENGTH];
   1290    int32_t capacity = 0;
   1291    char* buffer = sink->GetAppendBuffer(
   1292            LENGTH, LENGTH, scratch, UPRV_LENGTHOF(scratch), &capacity);
   1293 
   1294    buffer[0] = uprv_toupper(localeID[0]);
   1295    for (int32_t i = 1; i < LENGTH; ++i) {
   1296        buffer[i] = uprv_tolower(localeID[i]);
   1297    }
   1298 
   1299    sink->Append(buffer, LENGTH);
   1300    return len;
   1301 }
   1302 
   1303 size_t _getRegion(std::string_view localeID, ByteSink* sink) {
   1304    constexpr int32_t MINLEN = 2;
   1305    constexpr int32_t MAXLEN = ULOC_COUNTRY_CAPACITY - 1;  // Minus NUL.
   1306 
   1307    size_t len = 0;
   1308    while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) {
   1309        if (len == MAXLEN) { return 0; }
   1310        len++;
   1311    }
   1312    if (len < MINLEN) { return 0; }
   1313 
   1314    if (sink == nullptr) { return len; }
   1315 
   1316    char scratch[ULOC_COUNTRY_CAPACITY];
   1317    int32_t capacity = 0;
   1318    char* buffer = sink->GetAppendBuffer(
   1319            ULOC_COUNTRY_CAPACITY,
   1320            ULOC_COUNTRY_CAPACITY,
   1321            scratch,
   1322            UPRV_LENGTHOF(scratch),
   1323            &capacity);
   1324 
   1325    for (size_t i = 0; i < len; ++i) {
   1326        buffer[i] = uprv_toupper(localeID[i]);
   1327    }
   1328 
   1329    if (len == 3) {
   1330        /* convert 3 character code to 2 character code if possible *CWB*/
   1331        U_ASSERT(capacity >= 4);
   1332        buffer[3] = '\0';
   1333        std::optional<int16_t> offset = _findIndex(COUNTRIES_3, buffer);
   1334        if (offset.has_value()) {
   1335            const char* const alias = COUNTRIES[*offset];
   1336            sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias)));
   1337            return len;
   1338        }
   1339    }
   1340 
   1341    sink->Append(buffer, static_cast<int32_t>(len));
   1342    return len;
   1343 }
   1344 
/**
 * Parses a variant out of localeID and, when a sink is given, appends it in
 * upper case.
 *
 * Two input forms are handled, selected by prev:
 *  - prev is an ID separator: localeID is read as one or more subtags
 *    delimited by any of ".@_-". Subtags are written joined by '_'. Parsing
 *    stops at the end of the input, or when the delimiter is a terminator
 *    (per _isTerminator) or begins a BCP47 extension (per _isBCP47Extension).
 *  - otherwise: the text following '@' is used. When prev is '@', localeID
 *    itself is that text; else the '@' is located with
 *    locale_getKeywordsStart(). Chars are copied up to the first terminator,
 *    with '-' and ',' rewritten to '_'.
 *
 * @param localeID the text that follows prev in the full locale ID.
 * @param prev the char immediately preceding localeID.
 * @param sink receives the variant; may be nullptr to only measure.
 * @param needSeparator if true, then add leading '_' if any variants
 * are added to 'variant'
 * @param status set to U_ILLEGAL_ARGUMENT_ERROR when the variant is longer
 *        than MAX_VARIANTS_LENGTH; nothing is parsed if already a failure.
 * @return the number of chars of localeID that were consumed, or 0 when no
 *         variant was found or an error occurred.
 */
size_t
_getVariant(std::string_view localeID,
            char prev,
            ByteSink* sink,
            bool needSeparator,
            UErrorCode& status) {
    if (U_FAILURE(status) || localeID.empty()) return 0;

    // Reasonable upper limit for variants.
    // There is no strict limitation on the syntax of a variant in the legacy
    // locale format. If the locale is constructed from a unicode_locale_id
    // as defined in UTS35, then we know each unicode_variant_subtag
    // can have a max length of 8 (alphanum{5,8} | digit alphanum{3}).
    // 179 allows 20 unicode_variant_subtag with separators in the
    // unicode_locale_id:
    // 8*20 + 1*(20-1) = 179
    constexpr int32_t MAX_VARIANTS_LENGTH = 179;

    /* get one or more variant tags and separate them with '_' */
    size_t index = 0;
    if (_isIDSeparator(prev)) {
        /* get a variant string after a '-' or '_' */
        // Every path through this loop ends in a return, so the '@' handling
        // below is only reached when prev is not an ID separator.
        for (std::string_view sub = localeID;;) {
            size_t next = sub.find_first_of(".@_-");
            // For historical reasons, a trailing separator is included in the variant.
            bool finished = next == std::string_view::npos || next + 1 == sub.length();
            size_t limit = finished ? sub.length() : next;
            // index is the running count of input chars consumed so far.
            index += limit;
            if (index > MAX_VARIANTS_LENGTH) {
                status = U_ILLEGAL_ARGUMENT_ERROR;
                return 0;
            }

            if (sink != nullptr) {
                // Only the first subtag honors the caller's needSeparator;
                // every later subtag is always preceded by '_'.
                if (needSeparator) {
                    sink->Append("_", 1);
                } else {
                    needSeparator = true;
                }

                int32_t length = static_cast<int32_t>(limit);
                int32_t minCapacity = uprv_min(length, MAX_VARIANTS_LENGTH);
                char scratch[MAX_VARIANTS_LENGTH];
                int32_t capacity = 0;
                char* buffer = sink->GetAppendBuffer(
                        minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity);

                for (size_t i = 0; i < limit; ++i) {
                    buffer[i] = uprv_toupper(sub[i]);
                }
                sink->Append(buffer, length);
            }

            if (finished) { return index; }
            // Move to the delimiter and decide whether another subtag follows.
            sub.remove_prefix(next);
            if (_isTerminator(sub.front()) || _isBCP47Extension(sub)) { return index; }
            // Step over the delimiter and count it as consumed.
            sub.remove_prefix(1);
            index++;
        }
    }

    size_t skip = 0;
    /* if there is no variant tag after a '-' or '_' then look for '@' */
    if (prev == '@') {
        /* keep localeID */
    } else if (const char* p = locale_getKeywordsStart(localeID); p != nullptr) {
        skip = 1 + p - localeID.data(); /* point after the '@' */
        localeID.remove_prefix(skip);
    } else {
        return 0;
    }
    // index is still 0 here; from now on it counts chars after the '@'.
    for (; index < localeID.size() && !_isTerminator(localeID[index]); index++) {
        if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        // The separator is emitted at most once, before the first copied char.
        if (needSeparator) {
            if (sink != nullptr) {
                sink->Append("_", 1);
            }
            needSeparator = false;
        }
        if (sink != nullptr) {
            char c = uprv_toupper(localeID[index]);
            if (c == '-' || c == ',') c = '_';
            sink->Append(&c, 1);
        }
    }
    // Include whatever was skipped to get past the '@'.
    return skip + index;
}
   1439 
   1440 }  // namespace
   1441 
   1442 U_EXPORT CharString
   1443 ulocimp_getLanguage(std::string_view localeID, UErrorCode& status) {
   1444    return ByteSinkUtil::viaByteSinkToCharString(
   1445        [&](ByteSink& sink, UErrorCode& status) {
   1446            ulocimp_getSubtags(
   1447                    localeID,
   1448                    &sink,
   1449                    nullptr,
   1450                    nullptr,
   1451                    nullptr,
   1452                    nullptr,
   1453                    status);
   1454        },
   1455        status);
   1456 }
   1457 
   1458 U_EXPORT CharString
   1459 ulocimp_getScript(std::string_view localeID, UErrorCode& status) {
   1460    return ByteSinkUtil::viaByteSinkToCharString(
   1461        [&](ByteSink& sink, UErrorCode& status) {
   1462            ulocimp_getSubtags(
   1463                    localeID,
   1464                    nullptr,
   1465                    &sink,
   1466                    nullptr,
   1467                    nullptr,
   1468                    nullptr,
   1469                    status);
   1470        },
   1471        status);
   1472 }
   1473 
   1474 U_EXPORT CharString
   1475 ulocimp_getRegion(std::string_view localeID, UErrorCode& status) {
   1476    return ByteSinkUtil::viaByteSinkToCharString(
   1477        [&](ByteSink& sink, UErrorCode& status) {
   1478            ulocimp_getSubtags(
   1479                    localeID,
   1480                    nullptr,
   1481                    nullptr,
   1482                    &sink,
   1483                    nullptr,
   1484                    nullptr,
   1485                    status);
   1486        },
   1487        status);
   1488 }
   1489 
   1490 U_EXPORT CharString
   1491 ulocimp_getVariant(std::string_view localeID, UErrorCode& status) {
   1492    return ByteSinkUtil::viaByteSinkToCharString(
   1493        [&](ByteSink& sink, UErrorCode& status) {
   1494            ulocimp_getSubtags(
   1495                    localeID,
   1496                    nullptr,
   1497                    nullptr,
   1498                    nullptr,
   1499                    &sink,
   1500                    nullptr,
   1501                    status);
   1502        },
   1503        status);
   1504 }
   1505 
   1506 U_EXPORT void
   1507 ulocimp_getSubtags(
   1508        std::string_view localeID,
   1509        CharString* language,
   1510        CharString* script,
   1511        CharString* region,
   1512        CharString* variant,
   1513        const char** pEnd,
   1514        UErrorCode& status) {
   1515    if (U_FAILURE(status)) { return; }
   1516 
   1517    std::optional<CharStringByteSink> languageSink;
   1518    std::optional<CharStringByteSink> scriptSink;
   1519    std::optional<CharStringByteSink> regionSink;
   1520    std::optional<CharStringByteSink> variantSink;
   1521 
   1522    if (language != nullptr) { languageSink.emplace(language); }
   1523    if (script != nullptr) { scriptSink.emplace(script); }
   1524    if (region != nullptr) { regionSink.emplace(region); }
   1525    if (variant != nullptr) { variantSink.emplace(variant); }
   1526 
   1527    ulocimp_getSubtags(
   1528            localeID,
   1529            languageSink.has_value() ? &*languageSink : nullptr,
   1530            scriptSink.has_value() ? &*scriptSink : nullptr,
   1531            regionSink.has_value() ? &*regionSink : nullptr,
   1532            variantSink.has_value() ? &*variantSink : nullptr,
   1533            pEnd,
   1534            status);
   1535 }
   1536 
/**
 * Splits the leading part of a locale ID into language, script, region and
 * variant, appending each recognized subtag to the matching sink.
 *
 * Every sink may be nullptr. The subtags are parsed in order, and parsing
 * stops early once none of the remaining outputs (including pEnd) has been
 * requested, or once the input is exhausted.
 *
 * @param localeID the locale ID to parse.
 * @param language receives the language, or nullptr.
 * @param script receives the script, or nullptr.
 * @param region receives the region, or nullptr.
 * @param variant receives the variant, or nullptr. A "-va-posix" piece found
 *        in a BCP47 extension is appended as "POSIX".
 * @param pEnd if not nullptr, receives a pointer into localeID's data just
 *        past the last piece that was recognized (the start of the data if
 *        nothing was).
 * @param status error code; nothing happens if it is already a failure.
 */
U_EXPORT void
ulocimp_getSubtags(
        std::string_view localeID,
        ByteSink* language,
        ByteSink* script,
        ByteSink* region,
        ByteSink* variant,
        const char** pEnd,
        UErrorCode& status) {
    if (U_FAILURE(status)) { return; }

    // Give pEnd a defined value up front; with no output requested at all
    // there is nothing to do.
    if (pEnd != nullptr) {
        *pEnd = localeID.data();
    } else if (language == nullptr &&
               script == nullptr &&
               region == nullptr &&
               variant == nullptr) {
        return;
    }

    if (localeID.empty()) { return; }

    bool hasRegion = false;

    // Language: drop however many chars _getLanguage reports.
    {
        size_t len = _getLanguage(localeID, language, status);
        if (U_FAILURE(status)) { return; }
        if (len > 0) {
            localeID.remove_prefix(len);
        }
    }

    if (pEnd != nullptr) {
        *pEnd = localeID.data();
    } else if (script == nullptr &&
               region == nullptr &&
               variant == nullptr) {
        return;
    }

    if (localeID.empty()) { return; }

    // Script: looked for right after one ID separator. On a match the
    // separator is consumed together with the script (hence len + 1).
    if (_isIDSeparator(localeID.front())) {
        std::string_view sub = localeID;
        sub.remove_prefix(1);
        size_t len = _getScript(sub, script);
        if (len > 0) {
            localeID.remove_prefix(len + 1);
            if (pEnd != nullptr) { *pEnd = localeID.data(); }
        }
    }

    if ((region == nullptr && variant == nullptr && pEnd == nullptr) || localeID.empty()) { return; }

    // Region: same pattern as the script.
    if (_isIDSeparator(localeID.front())) {
        std::string_view sub = localeID;
        sub.remove_prefix(1);
        size_t len = _getRegion(sub, region);
        if (len > 0) {
            hasRegion = true;
            localeID.remove_prefix(len + 1);
            if (pEnd != nullptr) { *pEnd = localeID.data(); }
        }
    }

    if ((variant == nullptr && pEnd == nullptr) || localeID.empty()) { return; }

    bool hasVariant = false;

    // Variant: only when what follows is not a BCP47 extension.
    if (_isIDSeparator(localeID.front()) && !_isBCP47Extension(localeID)) {
        std::string_view sub = localeID;
        /* If there was no country ID, skip a possible extra IDSeparator */
        size_t skip = !hasRegion && localeID.size() > 1 && _isIDSeparator(localeID[1]) ? 2 : 1;
        sub.remove_prefix(skip);
        size_t len = _getVariant(sub, localeID[0], variant, false, status);
        if (U_FAILURE(status)) { return; }
        if (len > 0) {
            hasVariant = true;
            localeID.remove_prefix(skip + len);
            if (pEnd != nullptr) { *pEnd = localeID.data(); }
        }
    }

    if ((variant == nullptr && pEnd == nullptr) || localeID.empty()) { return; }

    // BCP47 extension: scan it for "-va-posix" (compared case-insensitively).
    if (_isBCP47Extension(localeID)) {
        localeID.remove_prefix(2);
        constexpr char vaposix[] = "-va-posix";
        constexpr size_t length = sizeof vaposix - 1;
        // Each pass looks at the text up to the second '-' found after the
        // current position, then advances to that '-'.
        // NOTE(review): presumably each such chunk is one "-key-value" pair — confirm.
        for (size_t next;; localeID.remove_prefix(next)) {
            next = localeID.find('-', 1);
            if (next == std::string_view::npos) { break; }
            next = localeID.find('-', next + 1);
            bool finished = next == std::string_view::npos;
            std::string_view sub = localeID;
            if (!finished) { sub.remove_suffix(sub.length() - next); }

            if (sub.length() == length && uprv_strnicmp(sub.data(), vaposix, length) == 0) {
                if (variant != nullptr) {
                    // Join with '_' if a variant was already written above.
                    if (hasVariant) { variant->Append("_", 1); }
                    constexpr char posix[] = "POSIX";
                    variant->Append(posix, sizeof posix - 1);
                }
                // Report the position just past "-va-posix".
                if (pEnd != nullptr) { *pEnd = localeID.data() + length; }
            }

            if (finished) { break; }
        }
    }
}
   1647 
   1648 /* Keyword enumeration */
   1649 
   1650 typedef struct UKeywordsContext {
   1651    char* keywords;
   1652    char* current;
   1653 } UKeywordsContext;
   1654 
   1655 U_CDECL_BEGIN
   1656 
   1657 static void U_CALLCONV
   1658 uloc_kw_closeKeywords(UEnumeration *enumerator) {
   1659    uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
   1660    uprv_free(enumerator->context);
   1661    uprv_free(enumerator);
   1662 }
   1663 
   1664 static int32_t U_CALLCONV
   1665 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
   1666    char *kw = ((UKeywordsContext *)en->context)->keywords;
   1667    int32_t result = 0;
   1668    while(*kw) {
   1669        result++;
   1670        kw += uprv_strlen(kw)+1;
   1671    }
   1672    return result;
   1673 }
   1674 
   1675 static const char * U_CALLCONV
   1676 uloc_kw_nextKeyword(UEnumeration* en,
   1677                    int32_t* resultLength,
   1678                    UErrorCode* /*status*/) {
   1679    const char* result = ((UKeywordsContext *)en->context)->current;
   1680    int32_t len = 0;
   1681    if(*result) {
   1682        len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
   1683        ((UKeywordsContext *)en->context)->current += len+1;
   1684    } else {
   1685        result = nullptr;
   1686    }
   1687    if (resultLength) {
   1688        *resultLength = len;
   1689    }
   1690    return result;
   1691 }
   1692 
   1693 static void U_CALLCONV
   1694 uloc_kw_resetKeywords(UEnumeration* en,
   1695                      UErrorCode* /*status*/) {
   1696    ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
   1697 }
   1698 
   1699 U_CDECL_END
   1700 
   1701 
// Prototype UEnumeration for keyword lists. uloc_openKeywordList() copies it
// with uprv_memcpy into every enumeration it creates and then points that
// copy's context at a freshly allocated UKeywordsContext.
// NOTE(review): the meaning of the two leading nullptr slots depends on the
// UEnumeration declaration, which is not visible here — confirm before reordering.
static const UEnumeration gKeywordsEnum = {
    nullptr,
    nullptr,
    uloc_kw_closeKeywords,   // frees the keyword buffer, the context and the enumeration
    uloc_kw_countKeywords,   // counts the keywords in the buffer
    uenum_unextDefault,
    uloc_kw_nextKeyword,     // returns the current keyword and advances
    uloc_kw_resetKeywords    // rewinds to the first keyword
};
   1711 
   1712 U_CAPI UEnumeration* U_EXPORT2
   1713 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
   1714 {
   1715    if (U_FAILURE(*status)) { return nullptr; }
   1716 
   1717    LocalMemory<UKeywordsContext> myContext;
   1718    LocalMemory<UEnumeration> result;
   1719 
   1720    myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext))));
   1721    result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration))));
   1722    if (myContext.isNull() || result.isNull()) {
   1723        *status = U_MEMORY_ALLOCATION_ERROR;
   1724        return nullptr;
   1725    }
   1726    uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration));
   1727    myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1));
   1728    if (myContext->keywords == nullptr) {
   1729        *status = U_MEMORY_ALLOCATION_ERROR;
   1730        return nullptr;
   1731    }
   1732    uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
   1733    myContext->keywords[keywordListSize] = 0;
   1734    myContext->current = myContext->keywords;
   1735    result->context = myContext.orphan();
   1736    return result.orphan();
   1737 }
   1738 
   1739 U_CAPI UEnumeration* U_EXPORT2
   1740 uloc_openKeywords(const char* localeID,
   1741                        UErrorCode* status)
   1742 {
   1743    if(status==nullptr || U_FAILURE(*status)) {
   1744        return nullptr;
   1745    }
   1746 
   1747    CharString tempBuffer;
   1748    const char* tmpLocaleID;
   1749 
   1750    if (localeID != nullptr && _hasBCP47Extension(localeID)) {
   1751        tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, *status);
   1752        tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID;
   1753    } else {
   1754        if (localeID==nullptr) {
   1755            localeID=uloc_getDefault();
   1756        }
   1757        tmpLocaleID=localeID;
   1758    }
   1759 
   1760    ulocimp_getSubtags(
   1761            tmpLocaleID,
   1762            nullptr,
   1763            nullptr,
   1764            nullptr,
   1765            nullptr,
   1766            &tmpLocaleID,
   1767            *status);
   1768    if (U_FAILURE(*status)) {
   1769        return nullptr;
   1770    }
   1771 
   1772    /* keywords are located after '@' */
   1773    if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != nullptr) {
   1774        CharString keywords = ulocimp_getKeywords(tmpLocaleID + 1, '@', false, *status);
   1775        if (U_FAILURE(*status)) {
   1776            return nullptr;
   1777        }
   1778        return uloc_openKeywordList(keywords.data(), keywords.length(), status);
   1779    }
   1780    return nullptr;
   1781 }
   1782 
   1783 
/* bit-flags for 'options' parameter of _canonicalize */
/* _ULOC_STRIP_KEYWORDS: when set, _canonicalize does not append the keyword list to its output. */
#define _ULOC_STRIP_KEYWORDS 0x2
/* _ULOC_CANONICALIZE: when set, _canonicalize folds an "@FOO" variant into the tag and
   looks the result up in CANONICALIZE_MAP; when clear, a POSIX charset and "@FOO" are copied as-is. */
#define _ULOC_CANONICALIZE   0x1
   1787 
   1788 namespace {
   1789 
   1790 inline bool OPTION_SET(uint32_t options, uint32_t mask) { return (options & mask) != 0; }
   1791 
// The tag "i-default", stored without a terminating NUL. _canonicalize
// compares the start of a locale ID against it with uprv_strncmp over exactly
// I_DEFAULT_LENGTH chars, so it must never be used as a C string.
constexpr char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
// Number of chars in i_default.
constexpr int32_t I_DEFAULT_LENGTH = UPRV_LENGTHOF(i_default);
   1794 
/**
 * Canonicalize the given localeID, to level 1 or to level 2,
 * depending on the options.  To specify level 1, pass in options=0.
 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
 *
 * This is the code underlying uloc_getName and uloc_canonicalize.
 *
 * Outline:
 *  1. A localeID with a BCP47 extension is converted with
 *     ulocimp_forLanguageTag() (after turning '_' into '-' where needed).
 *  2. Language, script, region and variant are parsed and re-joined with '_'.
 *  3. Level 1 only: a POSIX charset (".xxx") and an "@FOO" variant are
 *     copied through unchanged.
 *  4. Level 2 only: an "@FOO" variant is folded into the tag and the result
 *     is looked up in CANONICALIZE_MAP.
 *  5. Unless _ULOC_STRIP_KEYWORDS is set, the keywords are appended after '@'.
 *
 * @param localeID the locale ID to process.
 * @param sink receives the result.
 * @param options any combination of _ULOC_CANONICALIZE and _ULOC_STRIP_KEYWORDS.
 * @param err error code; nothing happens if it is already a failure.
 */
void
_canonicalize(std::string_view localeID,
              ByteSink& sink,
              uint32_t options,
              UErrorCode& err) {
    if (U_FAILURE(err)) {
        return;
    }

    int32_t j, fieldCount=0;
    CharString tempBuffer;  // if localeID has a BCP47 extension, tmpLocaleID points to this
    CharString localeIDWithHyphens;  // if localeID has a BCP47 extension and contains '_', tmpLocaleID may point to this
    std::string_view origLocaleID;
    std::string_view tmpLocaleID;
    // Positions of the first '=' and ';' in the '@' section; npos if absent.
    size_t keywordAssign = std::string_view::npos;
    size_t separatorIndicator = std::string_view::npos;

    if (_hasBCP47Extension(localeID)) {
        std::string_view localeIDPtr = localeID;

        // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string
        if (localeID.size() >= 2 && localeID.find('_') != std::string_view::npos && localeID[1] != '-' && localeID[1] != '_') {
            localeIDWithHyphens.append(localeID, err);
            if (U_SUCCESS(err)) {
                for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) {
                    if (*p == '_') {
                        *p = '-';
                    }
                }
                localeIDPtr = localeIDWithHyphens.toStringPiece();
            }
        }

        tempBuffer = ulocimp_forLanguageTag(localeIDPtr.data(), static_cast<int32_t>(localeIDPtr.size()), nullptr, err);
        // Fall back to the unconverted text if the conversion failed or produced nothing.
        tmpLocaleID = U_SUCCESS(err) && !tempBuffer.isEmpty() ? static_cast<std::string_view>(tempBuffer.toStringPiece()) : localeIDPtr;
    } else {
        tmpLocaleID=localeID;
    }

    origLocaleID=tmpLocaleID;

    /* get all pieces, one after another, and separate with '_' */
    CharString tag;
    CharString script;
    CharString country;
    CharString variant;
    const char* end = nullptr;
    ulocimp_getSubtags(
            tmpLocaleID,
            &tag,
            &script,
            &country,
            &variant,
            &end,
            err);
    if (U_FAILURE(err)) {
        return;
    }
    U_ASSERT(end != nullptr);
    // Continue after the subtags that were just parsed.
    if (end > tmpLocaleID.data()) {
        tmpLocaleID.remove_prefix(end - tmpLocaleID.data());
    }

    // "i-default" is replaced by the default locale; everything else gets
    // its script, country and variant appended to the language.
    if (tag.length() == I_DEFAULT_LENGTH && origLocaleID.length() >= I_DEFAULT_LENGTH &&
            uprv_strncmp(origLocaleID.data(), i_default, I_DEFAULT_LENGTH) == 0) {
        tag.clear();
        tag.append(uloc_getDefault(), err);
    } else {
        if (!script.isEmpty()) {
            ++fieldCount;
            tag.append('_', err);
            tag.append(script, err);
        }
        if (!country.isEmpty()) {
            ++fieldCount;
            tag.append('_', err);
            tag.append(country, err);
        }
        if (!variant.isEmpty()) {
            ++fieldCount;
            // With no country, an extra '_' holds the empty country position.
            if (country.isEmpty()) {
                tag.append('_', err);
            }
            tag.append('_', err);
            tag.append(variant, err);
        }
    }

    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && !tmpLocaleID.empty() && tmpLocaleID.front() == '.') {
        tag.append('.', err);
        tmpLocaleID.remove_prefix(1);
        // The charset runs up to the next '@', or to the end if there is none.
        size_t length;
        if (size_t atPos = tmpLocaleID.find('@'); atPos != std::string_view::npos) {
            length = atPos;
        } else {
            length = tmpLocaleID.length();
        }
        // The longest charset name we found in IANA charset registry
        // https://www.iana.org/assignments/character-sets/ is
        // "Extended_UNIX_Code_Packed_Format_for_Japanese" in length 45.
        // we therefore restrict the length here to be 64 which is a power of 2
        // number that is longer than 45.
        constexpr size_t kMaxCharsetLength = 64;
        if (length > kMaxCharsetLength) {
           err = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */
           return;
        }
        if (length > 0) {
            tag.append(tmpLocaleID.data(), static_cast<int32_t>(length), err);
            tmpLocaleID.remove_prefix(length);
        }
    }

    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
       After this, tmpLocaleID either starts at '@' or is empty. */
    if (const char* start = locale_getKeywordsStart(tmpLocaleID); start != nullptr) {
        if (start > tmpLocaleID.data()) {
            tmpLocaleID.remove_prefix(start - tmpLocaleID.data());
        }
        keywordAssign = tmpLocaleID.find('=');
        separatorIndicator = tmpLocaleID.find(';');
    } else {
        tmpLocaleID = {};
    }

    /* Copy POSIX-style variant, if any [mr@FOO] */
    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
        !tmpLocaleID.empty() && keywordAssign == std::string_view::npos) {
        tag.append(tmpLocaleID, err);
        tmpLocaleID = {};
    }

    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
        /* Handle @FOO variant if @ is present and not followed by = */
        if (!tmpLocaleID.empty() && keywordAssign == std::string_view::npos) {
            /* Add missing '_' if needed */
            if (fieldCount < 2 || (fieldCount < 3 && !script.isEmpty())) {
                do {
                    tag.append('_', err);
                    ++fieldCount;
                } while(fieldCount<2);
            }

            // Append the text after '@' to the tag as a variant.
            CharStringByteSink s(&tag);
            std::string_view sub = tmpLocaleID;
            sub.remove_prefix(1);
            _getVariant(sub, '@', &s, !variant.isEmpty(), err);
            if (U_FAILURE(err)) { return; }
        }

        /* Look up the ID in the canonicalization map */
        for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
            StringPiece id(CANONICALIZE_MAP[j].id);
            if (tag == id) {
                if (id.empty() && !tmpLocaleID.empty()) {
                    break; /* Don't remap "" if keywords present */
                }
                tag.clear();
                tag.append(CANONICALIZE_MAP[j].canonicalID, err);
                break;
            }
        }
    }

    sink.Append(tag.data(), tag.length());

    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
        // Keywords are emitted only when the '@' section has an '=' and no
        // ';' comes before that '='.
        if (!tmpLocaleID.empty() && keywordAssign != std::string_view::npos &&
            (separatorIndicator == std::string_view::npos || separatorIndicator > keywordAssign)) {
            sink.Append("@", 1);
            ++fieldCount;
            tmpLocaleID.remove_prefix(1);
            ulocimp_getKeywords(tmpLocaleID, '@', sink, true, err);
        }
    }
}
   1979 
   1980 }  // namespace
   1981 
   1982 /* ### ID parsing API **************************************************/
   1983 
   1984 U_CAPI int32_t  U_EXPORT2
   1985 uloc_getParent(const char*    localeID,
   1986               char* parent,
   1987               int32_t parentCapacity,
   1988               UErrorCode* err)
   1989 {
   1990    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   1991        parent, parentCapacity,
   1992        [&](ByteSink& sink, UErrorCode& status) {
   1993            ulocimp_getParent(localeID, sink, status);
   1994        },
   1995        *err);
   1996 }
   1997 
   1998 U_EXPORT CharString
   1999 ulocimp_getParent(const char* localeID,
   2000                  UErrorCode& err)
   2001 {
   2002    return ByteSinkUtil::viaByteSinkToCharString(
   2003        [&](ByteSink& sink, UErrorCode& status) {
   2004            ulocimp_getParent(localeID, sink, status);
   2005        },
   2006        err);
   2007 }
   2008 
   2009 U_EXPORT void
   2010 ulocimp_getParent(const char* localeID,
   2011                  icu::ByteSink& sink,
   2012                  UErrorCode& err)
   2013 {
   2014    if (U_FAILURE(err)) { return; }
   2015 
   2016    const char *lastUnderscore;
   2017    int32_t i;
   2018 
   2019    if (localeID == nullptr)
   2020        localeID = uloc_getDefault();
   2021 
   2022    lastUnderscore=uprv_strrchr(localeID, '_');
   2023    if(lastUnderscore!=nullptr) {
   2024        i = static_cast<int32_t>(lastUnderscore - localeID);
   2025    } else {
   2026        i=0;
   2027    }
   2028 
   2029    if (i > 0) {
   2030        if (uprv_strnicmp(localeID, "und_", 4) == 0) {
   2031            localeID += 3;
   2032            i -= 3;
   2033        }
   2034        sink.Append(localeID, i);
   2035    }
   2036 }
   2037 
   2038 U_CAPI int32_t U_EXPORT2
   2039 uloc_getLanguage(const char*    localeID,
   2040         char* language,
   2041         int32_t languageCapacity,
   2042         UErrorCode* err)
   2043 {
   2044    if (localeID == nullptr) {
   2045        localeID = uloc_getDefault();
   2046    }
   2047 
   2048    /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
   2049    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   2050        language, languageCapacity,
   2051        [&](ByteSink& sink, UErrorCode& status) {
   2052            ulocimp_getSubtags(
   2053                    localeID,
   2054                    &sink,
   2055                    nullptr,
   2056                    nullptr,
   2057                    nullptr,
   2058                    nullptr,
   2059                    status);
   2060        },
   2061        *err);
   2062 }
   2063 
   2064 U_CAPI int32_t U_EXPORT2
   2065 uloc_getScript(const char*    localeID,
   2066         char* script,
   2067         int32_t scriptCapacity,
   2068         UErrorCode* err)
   2069 {
   2070    if (localeID == nullptr) {
   2071        localeID = uloc_getDefault();
   2072    }
   2073 
   2074    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   2075        script, scriptCapacity,
   2076        [&](ByteSink& sink, UErrorCode& status) {
   2077            ulocimp_getSubtags(
   2078                    localeID,
   2079                    nullptr,
   2080                    &sink,
   2081                    nullptr,
   2082                    nullptr,
   2083                    nullptr,
   2084                    status);
   2085        },
   2086        *err);
   2087 }
   2088 
   2089 U_CAPI int32_t  U_EXPORT2
   2090 uloc_getCountry(const char* localeID,
   2091            char* country,
   2092            int32_t countryCapacity,
   2093            UErrorCode* err)
   2094 {
   2095    if (localeID == nullptr) {
   2096        localeID = uloc_getDefault();
   2097    }
   2098 
   2099    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   2100        country, countryCapacity,
   2101        [&](ByteSink& sink, UErrorCode& status) {
   2102            ulocimp_getSubtags(
   2103                    localeID,
   2104                    nullptr,
   2105                    nullptr,
   2106                    &sink,
   2107                    nullptr,
   2108                    nullptr,
   2109                    status);
   2110        },
   2111        *err);
   2112 }
   2113 
   2114 U_CAPI int32_t  U_EXPORT2
   2115 uloc_getVariant(const char* localeID,
   2116                char* variant,
   2117                int32_t variantCapacity,
   2118                UErrorCode* err)
   2119 {
   2120    if (localeID == nullptr) {
   2121        localeID = uloc_getDefault();
   2122    }
   2123 
   2124    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   2125        variant, variantCapacity,
   2126        [&](ByteSink& sink, UErrorCode& status) {
   2127            ulocimp_getSubtags(
   2128                    localeID,
   2129                    nullptr,
   2130                    nullptr,
   2131                    nullptr,
   2132                    &sink,
   2133                    nullptr,
   2134                    status);
   2135        },
   2136        *err);
   2137 }
   2138 
   2139 U_CAPI int32_t  U_EXPORT2
   2140 uloc_getName(const char* localeID,
   2141             char* name,
   2142             int32_t nameCapacity,
   2143             UErrorCode* err)
   2144 {
   2145    if (localeID == nullptr) {
   2146        localeID = uloc_getDefault();
   2147    }
   2148    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   2149        name, nameCapacity,
   2150        [&](ByteSink& sink, UErrorCode& status) {
   2151            ulocimp_getName(localeID, sink, status);
   2152        },
   2153        *err);
   2154 }
   2155 
   2156 U_EXPORT CharString
   2157 ulocimp_getName(std::string_view localeID,
   2158                UErrorCode& err)
   2159 {
   2160    return ByteSinkUtil::viaByteSinkToCharString(
   2161        [&](ByteSink& sink, UErrorCode& status) {
   2162            ulocimp_getName(localeID, sink, status);
   2163        },
   2164        err);
   2165 }
   2166 
   2167 U_EXPORT void
   2168 ulocimp_getName(std::string_view localeID,
   2169                ByteSink& sink,
   2170                UErrorCode& err)
   2171 {
   2172    _canonicalize(localeID, sink, 0, err);
   2173 }
   2174 
   2175 U_CAPI int32_t  U_EXPORT2
   2176 uloc_getBaseName(const char* localeID,
   2177                 char* name,
   2178                 int32_t nameCapacity,
   2179                 UErrorCode* err)
   2180 {
   2181    if (localeID == nullptr) {
   2182        localeID = uloc_getDefault();
   2183    }
   2184    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   2185        name, nameCapacity,
   2186        [&](ByteSink& sink, UErrorCode& status) {
   2187            ulocimp_getBaseName(localeID, sink, status);
   2188        },
   2189        *err);
   2190 }
   2191 
   2192 U_EXPORT CharString
   2193 ulocimp_getBaseName(std::string_view localeID,
   2194                    UErrorCode& err)
   2195 {
   2196    return ByteSinkUtil::viaByteSinkToCharString(
   2197        [&](ByteSink& sink, UErrorCode& status) {
   2198            ulocimp_getBaseName(localeID, sink, status);
   2199        },
   2200        err);
   2201 }
   2202 
   2203 U_EXPORT void
   2204 ulocimp_getBaseName(std::string_view localeID,
   2205                    ByteSink& sink,
   2206                    UErrorCode& err)
   2207 {
   2208    _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
   2209 }
   2210 
   2211 U_CAPI int32_t  U_EXPORT2
   2212 uloc_canonicalize(const char* localeID,
   2213                  char* name,
   2214                  int32_t nameCapacity,
   2215                  UErrorCode* err)
   2216 {
   2217    if (localeID == nullptr) {
   2218        localeID = uloc_getDefault();
   2219    }
   2220    return ByteSinkUtil::viaByteSinkToTerminatedChars(
   2221        name, nameCapacity,
   2222        [&](ByteSink& sink, UErrorCode& status) {
   2223            ulocimp_canonicalize(localeID, sink, status);
   2224        },
   2225        *err);
   2226 }
   2227 
   2228 U_EXPORT CharString
   2229 ulocimp_canonicalize(std::string_view localeID,
   2230                     UErrorCode& err)
   2231 {
   2232    return ByteSinkUtil::viaByteSinkToCharString(
   2233        [&](ByteSink& sink, UErrorCode& status) {
   2234            ulocimp_canonicalize(localeID, sink, status);
   2235        },
   2236        err);
   2237 }
   2238 
   2239 U_EXPORT void
   2240 ulocimp_canonicalize(std::string_view localeID,
   2241                     ByteSink& sink,
   2242                     UErrorCode& err)
   2243 {
   2244    _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
   2245 }
   2246 
   2247 U_CAPI const char*  U_EXPORT2
   2248 uloc_getISO3Language(const char* localeID)
   2249 {
   2250    UErrorCode err = U_ZERO_ERROR;
   2251 
   2252    if (localeID == nullptr)
   2253    {
   2254        localeID = uloc_getDefault();
   2255    }
   2256    CharString lang = ulocimp_getLanguage(localeID, err);
   2257    if (U_FAILURE(err))
   2258        return "";
   2259    std::optional<int16_t> offset = _findIndex(LANGUAGES, lang.data());
   2260    return offset.has_value() ? LANGUAGES_3[*offset] : "";
   2261 }
   2262 
   2263 U_CAPI const char*  U_EXPORT2
   2264 uloc_getISO3Country(const char* localeID)
   2265 {
   2266    UErrorCode err = U_ZERO_ERROR;
   2267 
   2268    if (localeID == nullptr)
   2269    {
   2270        localeID = uloc_getDefault();
   2271    }
   2272    CharString cntry = ulocimp_getRegion(localeID, err);
   2273    if (U_FAILURE(err))
   2274        return "";
   2275    std::optional<int16_t> offset = _findIndex(COUNTRIES, cntry.data());
   2276    return offset.has_value() ? COUNTRIES_3[*offset] : "";
   2277 }
   2278 
   2279 U_CAPI uint32_t  U_EXPORT2
   2280 uloc_getLCID(const char* localeID)
   2281 {
   2282    UErrorCode status = U_ZERO_ERROR;
   2283    uint32_t   lcid = 0;
   2284 
   2285    /* Check for incomplete id. */
   2286    if (!localeID || uprv_strlen(localeID) < 2) {
   2287        return 0;
   2288    }
   2289 
   2290    // First, attempt Windows platform lookup if available, but fall
   2291    // through to catch any special cases (ICU vs Windows name differences).
   2292    lcid = uprv_convertToLCIDPlatform(localeID, &status);
   2293    if (U_FAILURE(status)) {
   2294        return 0;
   2295    }
   2296    if (lcid > 0) {
   2297        // Windows found an LCID, return that
   2298        return lcid;
   2299    }
   2300 
   2301    CharString langID = ulocimp_getLanguage(localeID, status);
   2302    if (U_FAILURE(status)) {
   2303        return 0;
   2304    }
   2305 
   2306    if (uprv_strchr(localeID, '@')) {
   2307        // uprv_convertToLCID does not support keywords other than collation.
   2308        // Remove all keywords except collation.
   2309        CharString collVal = ulocimp_getKeywordValue(localeID, "collation", status);
   2310        if (U_SUCCESS(status) && !collVal.isEmpty()) {
   2311            CharString tmpLocaleID = ulocimp_getBaseName(localeID, status);
   2312            ulocimp_setKeywordValue("collation", collVal.toStringPiece(), tmpLocaleID, status);
   2313            if (U_SUCCESS(status)) {
   2314                return uprv_convertToLCID(langID.data(), tmpLocaleID.data(), &status);
   2315            }
   2316        }
   2317 
   2318        // fall through - all keywords are simply ignored
   2319        status = U_ZERO_ERROR;
   2320    }
   2321 
   2322    return uprv_convertToLCID(langID.data(), localeID, &status);
   2323 }
   2324 
   2325 U_CAPI int32_t U_EXPORT2
   2326 uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
   2327                UErrorCode *status)
   2328 {
   2329    return uprv_convertToPosix(hostid, locale, localeCapacity, status);
   2330 }
   2331 
   2332 /* ### Default locale **************************************************/
   2333 
   2334 U_CAPI const char*  U_EXPORT2
   2335 uloc_getDefault()
   2336 {
   2337    return locale_get_default();
   2338 }
   2339 
   2340 U_CAPI void  U_EXPORT2
   2341 uloc_setDefault(const char*   newDefaultLocale,
   2342             UErrorCode* err)
   2343 {
   2344    if (U_FAILURE(*err))
   2345        return;
   2346    /* the error code isn't currently used for anything by this function*/
   2347 
   2348    /* propagate change to C++ */
   2349    locale_set_default(newDefaultLocale);
   2350 }
   2351 
   2352 /**
   2353 * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
   2354 * to an array of pointers to arrays of char.  All of these pointers are owned
   2355 * by ICU-- do not delete them, and do not write through them.  The array is
   2356 * terminated with a null pointer.
   2357 */
   2358 U_CAPI const char* const*  U_EXPORT2
   2359 uloc_getISOLanguages()
   2360 {
   2361    return LANGUAGES;
   2362 }
   2363 
   2364 /**
   2365 * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
   2366 * pointer to an array of pointers to arrays of char.  All of these pointers are
   2367 * owned by ICU-- do not delete them, and do not write through them.  The array is
   2368 * terminated with a null pointer.
   2369 */
   2370 U_CAPI const char* const*  U_EXPORT2
   2371 uloc_getISOCountries()
   2372 {
   2373    return COUNTRIES;
   2374 }
   2375 
   2376 U_CAPI const char* U_EXPORT2
   2377 uloc_toUnicodeLocaleKey(const char* keyword)
   2378 {
   2379    if (keyword == nullptr || *keyword == '\0') { return nullptr; }
   2380    std::optional<std::string_view> result = ulocimp_toBcpKeyWithFallback(keyword);
   2381    return result.has_value() ? result->data() : nullptr;  // Known to be NUL terminated.
   2382 }
   2383 
   2384 U_EXPORT std::optional<std::string_view>
   2385 ulocimp_toBcpKeyWithFallback(std::string_view keyword)
   2386 {
   2387    std::optional<std::string_view> bcpKey = ulocimp_toBcpKey(keyword);
   2388    if (!bcpKey.has_value() &&
   2389        ultag_isUnicodeLocaleKey(keyword.data(), static_cast<int32_t>(keyword.size()))) {
   2390        // unknown keyword, but syntax is fine..
   2391        return keyword;
   2392    }
   2393    return bcpKey;
   2394 }
   2395 
   2396 U_CAPI const char* U_EXPORT2
   2397 uloc_toUnicodeLocaleType(const char* keyword, const char* value)
   2398 {
   2399    if (keyword == nullptr || *keyword == '\0' ||
   2400        value == nullptr || *value == '\0') { return nullptr; }
   2401    std::optional<std::string_view> result = ulocimp_toBcpTypeWithFallback(keyword, value);
   2402    return result.has_value() ? result->data() : nullptr;  // Known to be NUL terminated.
   2403 }
   2404 
   2405 U_EXPORT std::optional<std::string_view>
   2406 ulocimp_toBcpTypeWithFallback(std::string_view keyword, std::string_view value)
   2407 {
   2408    std::optional<std::string_view> bcpType = ulocimp_toBcpType(keyword, value);
   2409    if (!bcpType.has_value() &&
   2410        ultag_isUnicodeLocaleType(value.data(), static_cast<int32_t>(value.size()))) {
   2411        // unknown keyword, but syntax is fine..
   2412        return value;
   2413    }
   2414    return bcpType;
   2415 }
   2416 
   2417 namespace {
   2418 
   2419 bool
   2420 isWellFormedLegacyKey(std::string_view key)
   2421 {
   2422    return std::all_of(key.begin(), key.end(), UPRV_ISALPHANUM);
   2423 }
   2424 
   2425 bool
   2426 isWellFormedLegacyType(std::string_view legacyType)
   2427 {
   2428    int32_t alphaNumLen = 0;
   2429    for (char c : legacyType) {
   2430        if (c == '_' || c == '/' || c == '-') {
   2431            if (alphaNumLen == 0) {
   2432                return false;
   2433            }
   2434            alphaNumLen = 0;
   2435        } else if (UPRV_ISALPHANUM(c)) {
   2436            alphaNumLen++;
   2437        } else {
   2438            return false;
   2439        }
   2440    }
   2441    return alphaNumLen != 0;
   2442 }
   2443 
   2444 }  // namespace
   2445 
   2446 U_CAPI const char* U_EXPORT2
   2447 uloc_toLegacyKey(const char* keyword)
   2448 {
   2449    if (keyword == nullptr || *keyword == '\0') { return nullptr; }
   2450    std::optional<std::string_view> result = ulocimp_toLegacyKeyWithFallback(keyword);
   2451    return result.has_value() ? result->data() : nullptr;  // Known to be NUL terminated.
   2452 }
   2453 
   2454 U_EXPORT std::optional<std::string_view>
   2455 ulocimp_toLegacyKeyWithFallback(std::string_view keyword)
   2456 {
   2457    std::optional<std::string_view> legacyKey = ulocimp_toLegacyKey(keyword);
   2458    if (!legacyKey.has_value() && isWellFormedLegacyKey(keyword)) {
   2459        // Checks if the specified locale key is well-formed with the legacy locale syntax.
   2460        //
   2461        // Note:
   2462        //  LDML/CLDR provides some definition of keyword syntax in
   2463        //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
   2464        //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
   2465        //  Keys can only consist of [0-9a-zA-Z].
   2466        return keyword;
   2467    }
   2468    return legacyKey;
   2469 }
   2470 
   2471 U_CAPI const char* U_EXPORT2
   2472 uloc_toLegacyType(const char* keyword, const char* value)
   2473 {
   2474    if (keyword == nullptr || *keyword == '\0' ||
   2475        value == nullptr || *value == '\0') { return nullptr; }
   2476    std::optional<std::string_view> result = ulocimp_toLegacyTypeWithFallback(keyword, value);
   2477    return result.has_value() ? result->data() : nullptr;  // Known to be NUL terminated.
   2478 }
   2479 
   2480 U_EXPORT std::optional<std::string_view>
   2481 ulocimp_toLegacyTypeWithFallback(std::string_view keyword, std::string_view value)
   2482 {
   2483    std::optional<std::string_view> legacyType = ulocimp_toLegacyType(keyword, value);
   2484    if (!legacyType.has_value() && isWellFormedLegacyType(value)) {
   2485        // Checks if the specified locale type is well-formed with the legacy locale syntax.
   2486        //
   2487        // Note:
   2488        //  LDML/CLDR provides some definition of keyword syntax in
   2489        //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
   2490        //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
   2491        //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
   2492        //  we allow [/_-+] in the middle (e.g. "Etc/GMT+1", "Asia/Tel_Aviv")
   2493        return value;
   2494    }
   2495    return legacyType;
   2496 }
   2497 
   2498 /*eof*/
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE