uloc.cpp (90790B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 1997-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * 9 * File ULOC.CPP 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 04/01/97 aliu Creation. 15 * 08/21/98 stephen JDK 1.2 sync 16 * 12/08/98 rtg New Locale implementation and C API 17 * 03/15/99 damiba overhaul. 18 * 04/06/99 stephen changed setDefault() to realloc and copy 19 * 06/14/99 stephen Changed calls to ures_open for new params 20 * 07/21/99 stephen Modified setDefault() to propagate to C++ 21 * 05/14/04 alan 7 years later: refactored, cleaned up, fixed bugs, 22 * brought canonicalization code into line with spec 23 *****************************************************************************/ 24 25 /* 26 POSIX's locale format, from putil.c: [no spaces] 27 28 ll [ _CC ] [ . 
MM ] [ @ VV] 29 30 l = lang, C = ctry, M = charmap, V = variant 31 */ 32 33 #include <algorithm> 34 #include <optional> 35 #include <string_view> 36 37 #include "unicode/bytestream.h" 38 #include "unicode/errorcode.h" 39 #include "unicode/stringpiece.h" 40 #include "unicode/utypes.h" 41 #include "unicode/ustring.h" 42 #include "unicode/uloc.h" 43 44 #include "bytesinkutil.h" 45 #include "putilimp.h" 46 #include "ustr_imp.h" 47 #include "ulocimp.h" 48 #include "umutex.h" 49 #include "cstring.h" 50 #include "cmemory.h" 51 #include "locmap.h" 52 #include "uarrsort.h" 53 #include "uenumimp.h" 54 #include "uassert.h" 55 #include "charstr.h" 56 57 U_NAMESPACE_USE 58 59 /* ### Declarations **************************************************/ 60 61 /* Locale stuff from locid.cpp */ 62 U_CFUNC void locale_set_default(const char *id); 63 U_CFUNC const char *locale_get_default(); 64 65 namespace { 66 67 /* ### Data tables **************************************************/ 68 69 /** 70 * Table of language codes, both 2- and 3-letter, with preference 71 * given to 2-letter codes where possible. Includes 3-letter codes 72 * that lack a 2-letter equivalent. 73 * 74 * This list must be in sorted order. This list is returned directly 75 * to the user by some API. 76 * 77 * This list must be kept in sync with LANGUAGES_3, with corresponding 78 * entries matched. 79 * 80 * This table should be terminated with a nullptr entry, followed by a 81 * second list, and another nullptr entry. The first list is visible to 82 * user code when this array is returned by API. The second list 83 * contains codes we support, but do not expose through user API. 84 * 85 * Notes 86 * 87 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to 88 * include the revisions up to 2001/7/27 *CWB* 89 * 90 * The 3 character codes are the terminology codes like RFC 3066. 
This 91 * is compatible with prior ICU codes 92 * 93 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the 94 * table but now at the end of the table because 3 character codes are 95 * duplicates. This avoids bad searches going from 3 to 2 character 96 * codes. 97 * 98 * The range qaa-qtz is reserved for local use 99 */ 100 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */ 101 /* ISO639 table version is 20150505 */ 102 /* Subsequent hand addition of selected languages */ 103 constexpr const char* LANGUAGES[] = { 104 "aa", "ab", "ace", "ach", "ada", "ady", "ae", "aeb", 105 "af", "afh", "agq", "ain", "ak", "akk", "akz", "ale", 106 "aln", "alt", "am", "an", "ang", "anp", "ar", "arc", 107 "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "as", 108 "asa", "ase", "ast", "av", "avk", "awa", "ay", "az", 109 "ba", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj", 110 "be", "bej", "bem", "bew", "bez", "bfd", "bfq", "bg", 111 "bgc", "bgn", "bho", "bi", "bik", "bin", "bjn", "bkm", "bla", 112 "blo", "bm", "bn", "bo", "bpy", "bqi", "br", "bra", "brh", 113 "brx", "bs", "bss", "bua", "bug", "bum", "byn", "byv", 114 "ca", "cad", "car", "cay", "cch", "ccp", "ce", "ceb", "cgg", 115 "ch", "chb", "chg", "chk", "chm", "chn", "cho", "chp", 116 "chr", "chy", "ckb", "co", "cop", "cps", "cr", "crh", 117 "cs", "csb", "csw", "cu", "cv", "cy", 118 "da", "dak", "dar", "dav", "de", "del", "den", "dgr", 119 "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv", 120 "dyo", "dyu", "dz", "dzg", 121 "ebu", "ee", "efi", "egl", "egy", "eka", "el", "elx", 122 "en", "enm", "eo", "es", "esu", "et", "eu", "ewo", 123 "ext", 124 "fa", "fan", "fat", "ff", "fi", "fil", "fit", "fj", 125 "fo", "fon", "fr", "frc", "frm", "fro", "frp", "frr", 126 "frs", "fur", "fy", 127 "ga", "gaa", "gag", "gan", "gay", "gba", "gbz", "gd", 128 "gez", "gil", "gl", "glk", "gmh", "gn", "goh", "gom", 129 "gon", "gor", "got", "grb", "grc", "gsw", "gu", "guc", 130 "gur", "guz", "gv", "gwi", 131 
"ha", "hai", "hak", "haw", "he", "hi", "hif", "hil", 132 "hit", "hmn", "ho", "hr", "hsb", "hsn", "ht", "hu", 133 "hup", "hy", "hz", 134 "ia", "iba", "ibb", "id", "ie", "ig", "ii", "ik", 135 "ilo", "inh", "io", "is", "it", "iu", "izh", 136 "ja", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut", 137 "jv", 138 "ka", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd", 139 "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg", "kgp", 140 "kha", "kho", "khq", "khw", "ki", "kiu", "kj", "kk", 141 "kkj", "kl", "kln", "km", "kmb", "kn", "ko", "koi", 142 "kok", "kos", "kpe", "kr", "krc", "kri", "krj", "krl", 143 "kru", "ks", "ksb", "ksf", "ksh", "ku", "kum", "kut", 144 "kv", "kw", "kxv", "ky", 145 "la", "lad", "lag", "lah", "lam", "lb", "lez", "lfn", 146 "lg", "li", "lij", "liv", "lkt", "lmo", "ln", "lo", 147 "lol", "loz", "lrc", "lt", "ltg", "lu", "lua", "lui", 148 "lun", "luo", "lus", "luy", "lv", "lzh", "lzz", 149 "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde", 150 "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg", "mga", 151 "mgh", "mgo", "mh", "mi", "mic", "min", "mis", "mk", 152 "ml", "mn", "mnc", "mni", 153 "moh", "mos", "mr", "mrj", 154 "ms", "mt", "mua", "mul", "mus", "mwl", "mwr", "mwv", 155 "my", "mye", "myv", "mzn", 156 "na", "nan", "nap", "naq", "nb", "nd", "nds", "ne", 157 "new", "ng", "nia", "niu", "njo", "nl", "nmg", "nn", 158 "nnh", "no", "nog", "non", "nov", "nqo", "nr", "nso", 159 "nus", "nv", "nwc", "ny", "nym", "nyn", "nyo", "nzi", 160 "oc", "oj", "om", "or", "os", "osa", "ota", 161 "pa", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc", 162 "pdt", "peo", "pfl", "phn", "pi", "pl", "pms", "pnt", 163 "pon", "prg", "pro", "ps", "pt", 164 "qu", "quc", "qug", 165 "raj", "rap", "rar", "rgn", "rif", "rm", "rn", "ro", 166 "rof", "rom", "rtm", "ru", "rue", "rug", "rup", 167 "rw", "rwk", 168 "sa", "sad", "sah", "sam", "saq", "sas", "sat", "saz", 169 "sba", "sbp", "sc", "scn", "sco", "sd", "sdc", "sdh", 170 "se", "see", "seh", "sei", "sel", "ses", "sg", "sga", 171 
"sgs", "shi", "shn", "shu", "si", "sid", "sk", 172 "sl", "sli", "sly", "sm", "sma", "smj", "smn", "sms", 173 "sn", "snk", "so", "sog", "sq", "sr", "srn", "srr", 174 "ss", "ssy", "st", "stq", "su", "suk", "sus", "sux", 175 "sv", "sw", "swb", "syc", "syr", "szl", 176 "ta", "tcy", "te", "tem", "teo", "ter", "tet", "tg", 177 "th", "ti", "tig", "tiv", "tk", "tkl", "tkr", 178 "tlh", "tli", "tly", "tmh", "tn", "to", "tog", "tok", "tpi", 179 "tr", "tru", "trv", "ts", "tsd", "tsi", "tt", "ttt", 180 "tum", "tvl", "tw", "twq", "ty", "tyv", "tzm", 181 "udm", "ug", "uga", "uk", "umb", "und", "ur", "uz", 182 "vai", "ve", "vec", "vep", "vi", "vls", "vmf", "vmw", 183 "vo", "vot", "vro", "vun", 184 "wa", "wae", "wal", "war", "was", "wbp", "wo", "wuu", 185 "xal", "xh", "xmf", "xnr", "xog", 186 "yao", "yap", "yav", "ybb", "yi", "yo", "yrl", "yue", 187 "za", "zap", "zbl", "zea", "zen", "zgh", "zh", "zu", 188 "zun", "zxx", "zza", 189 nullptr, 190 "in", "iw", "ji", "jw", "mo", "sh", "swc", "tl", /* obsolete language codes */ 191 nullptr 192 }; 193 194 constexpr const char* DEPRECATED_LANGUAGES[]={ 195 "in", "iw", "ji", "jw", "mo", nullptr, nullptr 196 }; 197 constexpr const char* REPLACEMENT_LANGUAGES[]={ 198 "id", "he", "yi", "jv", "ro", nullptr, nullptr 199 }; 200 201 /** 202 * Table of 3-letter language codes. 203 * 204 * This is a lookup table used to convert 3-letter language codes to 205 * their 2-letter equivalent, where possible. It must be kept in sync 206 * with LANGUAGES. For all valid i, LANGUAGES[i] must refer to the 207 * same language as LANGUAGES_3[i]. The commented-out lines are 208 * copied from LANGUAGES to make eyeballing this baby easier. 209 * 210 * Where a 3-letter language code has no 2-letter equivalent, the 211 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i]. 212 * 213 * This table should be terminated with a nullptr entry, followed by a 214 * second list, and another nullptr entry. The two lists correspond to 215 * the two lists in LANGUAGES. 
216 */ 217 /* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */ 218 /* ISO639 table version is 20150505 */ 219 /* Subsequent hand addition of selected languages */ 220 constexpr const char* LANGUAGES_3[] = { 221 "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb", 222 "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale", 223 "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc", 224 "arn", "aro", "arp", "arq", "ars", "arw", "ary", "arz", "asm", 225 "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze", 226 "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj", 227 "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul", 228 "bgc", "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla", 229 "blo", "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh", 230 "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv", 231 "cat", "cad", "car", "cay", "cch", "ccp", "che", "ceb", "cgg", 232 "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp", 233 "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh", 234 "ces", "csb", "csw", "chu", "chv", "cym", 235 "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr", 236 "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div", 237 "dyo", "dyu", "dzo", "dzg", 238 "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx", 239 "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo", 240 "ext", 241 "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij", 242 "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr", 243 "frs", "fur", "fry", 244 "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla", 245 "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom", 246 "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc", 247 "gur", "guz", "glv", "gwi", 248 "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil", 249 "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun", 250 "hup", "hye", "her", 251 "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk", 252 "ilo", "inh", "ido", "isl", "ita", "iku", "izh", 253 "jpn", "jam", 
"jbo", "jgo", "jmc", "jpr", "jrb", "jut", 254 "jav", 255 "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd", 256 "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp", 257 "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz", 258 "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi", 259 "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl", 260 "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut", 261 "kom", "cor", "kxv", "kir", 262 "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn", 263 "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao", 264 "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui", 265 "lun", "luo", "lus", "luy", "lav", "lzh", "lzz", 266 "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde", 267 "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga", 268 "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd", 269 "mal", "mon", "mnc", "mni", 270 "moh", "mos", "mar", "mrj", 271 "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv", 272 "mya", "mye", "myv", "mzn", 273 "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep", 274 "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno", 275 "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso", 276 "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi", 277 "oci", "oji", "orm", "ori", "oss", "osa", "ota", 278 "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pcm", "pdc", 279 "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt", 280 "pon", "prg", "pro", "pus", "por", 281 "que", "quc", "qug", 282 "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron", 283 "rof", "rom", "rtm", "rus", "rue", "rug", "rup", 284 "kin", "rwk", 285 "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz", 286 "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh", 287 "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga", 288 "sgs", "shi", "shn", "shu", "sin", "sid", "slk", 289 "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms", 290 "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr", 291 "ssw", "ssy", 
"sot", "stq", "sun", "suk", "sus", "sux", 292 "swe", "swa", "swb", "syc", "syr", "szl", 293 "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk", 294 "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", 295 "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tok", "tpi", 296 "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt", 297 "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm", 298 "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb", 299 "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vmw", 300 "vol", "vot", "vro", "vun", 301 "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu", 302 "xal", "xho", "xmf", "xnr", "xog", 303 "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue", 304 "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul", 305 "zun", "zxx", "zza", 306 nullptr, 307 /* "in", "iw", "ji", "jw", "mo", "sh", "swc", "tl", */ 308 "ind", "heb", "yid", "jaw", "mol", "srp", "swc", "tgl", 309 nullptr 310 }; 311 312 /** 313 * Table of 2-letter country codes. 314 * 315 * This list must be in sorted order. This list is returned directly 316 * to the user by some API. 317 * 318 * This list must be kept in sync with COUNTRIES_3, with corresponding 319 * entries matched. 320 * 321 * This table should be terminated with a nullptr entry, followed by a 322 * second list, and another nullptr entry. The first list is visible to 323 * user code when this array is returned by API. The second list 324 * contains codes we support, but do not expose through user API. 
325 * 326 * Notes: 327 * 328 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per 329 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added 330 * new codes keeping the old ones for compatibility updated to include 331 * 1999/12/03 revisions *CWB* 332 * 333 * RO(ROM) is now RO(ROU) according to 334 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html 335 */ 336 constexpr const char* COUNTRIES[] = { 337 "AD", "AE", "AF", "AG", "AI", "AL", "AM", 338 "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ", 339 "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", 340 "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV", 341 "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG", 342 "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CQ", "CR", 343 "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK", 344 "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER", 345 "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR", 346 "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL", 347 "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU", 348 "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU", 349 "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS", 350 "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI", 351 "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", 352 "LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", 353 "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK", 354 "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS", 355 "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA", 356 "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP", 357 "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG", 358 "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT", 359 "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA", 360 "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ", 361 "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV", 362 "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ", 363 "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV", 364 "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ", 
365 "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", 366 "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", 367 nullptr, 368 "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR", /* obsolete country codes */ 369 nullptr 370 }; 371 372 constexpr const char* DEPRECATED_COUNTRIES[] = { 373 "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", nullptr, nullptr /* deprecated country list */ 374 }; 375 constexpr const char* REPLACEMENT_COUNTRIES[] = { 376 /* "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */ 377 "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", nullptr, nullptr /* replacement country codes */ 378 }; 379 380 /** 381 * Table of 3-letter country codes. 382 * 383 * This is a lookup table used to convert 3-letter country codes to 384 * their 2-letter equivalent. It must be kept in sync with COUNTRIES. 385 * For all valid i, COUNTRIES[i] must refer to the same country as 386 * COUNTRIES_3[i]. The commented-out lines are copied from COUNTRIES 387 * to make eyeballing this baby easier. 388 * 389 * This table should be terminated with a nullptr entry, followed by a 390 * second list, and another nullptr entry. The two lists correspond to 391 * the two lists in COUNTRIES. 
392 */ 393 constexpr const char* COUNTRIES_3[] = { 394 /* "AD", "AE", "AF", "AG", "AI", "AL", "AM", */ 395 "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM", 396 /* "AO", "AQ", "AR", "AS", "AT", "AU", "AW", "AX", "AZ", */ 397 "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE", 398 /* "BA", "BB", "BD", "BE", "BF", "BG", "BH", "BI", */ 399 "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI", 400 /* "BJ", "BL", "BM", "BN", "BO", "BQ", "BR", "BS", "BT", "BV", */ 401 "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT", 402 /* "BW", "BY", "BZ", "CA", "CC", "CD", "CF", "CG", */ 403 "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG", 404 /* "CH", "CI", "CK", "CL", "CM", "CN", "CO", "CQ", "CR", */ 405 "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRQ", "CRI", 406 /* "CU", "CV", "CW", "CX", "CY", "CZ", "DE", "DG", "DJ", "DK", */ 407 "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DGA", "DJI", "DNK", 408 /* "DM", "DO", "DZ", "EA", "EC", "EE", "EG", "EH", "ER", */ 409 "DMA", "DOM", "DZA", "XEA", "ECU", "EST", "EGY", "ESH", "ERI", 410 /* "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR", */ 411 "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA", 412 /* "GA", "GB", "GD", "GE", "GF", "GG", "GH", "GI", "GL", */ 413 "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL", 414 /* "GM", "GN", "GP", "GQ", "GR", "GS", "GT", "GU", */ 415 "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM", 416 /* "GW", "GY", "HK", "HM", "HN", "HR", "HT", "HU", */ 417 "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN", 418 /* "IC", "ID", "IE", "IL", "IM", "IN", "IO", "IQ", "IR", "IS" */ 419 "XIC", "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL", 420 /* "IT", "JE", "JM", "JO", "JP", "KE", "KG", "KH", "KI", */ 421 "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR", 422 /* "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", */ 423 "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO", 424 /* "LB", "LC", "LI", "LK", "LR", "LS", "LT", 
"LU", */ 425 "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX", 426 /* "LV", "LY", "MA", "MC", "MD", "ME", "MF", "MG", "MH", "MK", */ 427 "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD", 428 /* "ML", "MM", "MN", "MO", "MP", "MQ", "MR", "MS", */ 429 "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR", 430 /* "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA", */ 431 "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM", 432 /* "NC", "NE", "NF", "NG", "NI", "NL", "NO", "NP", */ 433 "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL", 434 /* "NR", "NU", "NZ", "OM", "PA", "PE", "PF", "PG", */ 435 "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG", 436 /* "PH", "PK", "PL", "PM", "PN", "PR", "PS", "PT", */ 437 "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT", 438 /* "PW", "PY", "QA", "RE", "RO", "RS", "RU", "RW", "SA", */ 439 "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU", 440 /* "SB", "SC", "SD", "SE", "SG", "SH", "SI", "SJ", */ 441 "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM", 442 /* "SK", "SL", "SM", "SN", "SO", "SR", "SS", "ST", "SV", */ 443 "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV", 444 /* "SX", "SY", "SZ", "TC", "TD", "TF", "TG", "TH", "TJ", */ 445 "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK", 446 /* "TK", "TL", "TM", "TN", "TO", "TR", "TT", "TV", */ 447 "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV", 448 /* "TW", "TZ", "UA", "UG", "UM", "US", "UY", "UZ", */ 449 "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB", 450 /* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */ 451 "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF", 452 /* "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", */ 453 "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE", 454 nullptr, 455 /* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */ 456 "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR", 457 nullptr 458 }; 459 460 typedef struct CanonicalizationMap { 461 
const char *id; /* input ID */ 462 const char *canonicalID; /* canonicalized output ID */ 463 } CanonicalizationMap; 464 465 /** 466 * A map to canonicalize locale IDs. This handles a variety of 467 * different semantic kinds of transformations. 468 */ 469 constexpr CanonicalizationMap CANONICALIZE_MAP[] = { 470 { "art__LOJBAN", "jbo" }, /* registered name */ 471 { "hy__AREVELA", "hy" }, /* Registered IANA variant */ 472 { "hy__AREVMDA", "hyw" }, /* Registered IANA variant */ 473 { "zh__GUOYU", "zh" }, /* registered name */ 474 { "zh__HAKKA", "hak" }, /* registered name */ 475 { "zh__XIANG", "hsn" }, /* registered name */ 476 // subtags with 3 chars won't be treated as variants. 477 { "zh_GAN", "gan" }, /* registered name */ 478 { "zh_MIN_NAN", "nan" }, /* registered name */ 479 { "zh_WUU", "wuu" }, /* registered name */ 480 { "zh_YUE", "yue" }, /* registered name */ 481 }; 482 483 /* ### BCP47 Conversion *******************************************/ 484 /* Gets the size of the shortest subtag in the given localeID. 
*/ 485 int32_t getShortestSubtagLength(std::string_view localeID) { 486 int32_t localeIDLength = static_cast<int32_t>(localeID.length()); 487 int32_t length = localeIDLength; 488 int32_t tmpLength = 0; 489 int32_t i; 490 bool reset = true; 491 492 for (i = 0; i < localeIDLength; i++) { 493 if (localeID[i] != '_' && localeID[i] != '-') { 494 if (reset) { 495 tmpLength = 0; 496 reset = false; 497 } 498 tmpLength++; 499 } else { 500 if (tmpLength != 0 && tmpLength < length) { 501 length = tmpLength; 502 } 503 reset = true; 504 } 505 } 506 507 return length; 508 } 509 /* Test if the locale id has BCP47 u extension and does not have '@' */ 510 inline bool _hasBCP47Extension(std::string_view id) { 511 return id.find('@') == std::string_view::npos && getShortestSubtagLength(id) == 1; 512 } 513 514 /* ### Keywords **************************************************/ 515 inline bool UPRV_ISDIGIT(char c) { return c >= '0' && c <= '9'; } 516 inline bool UPRV_ISALPHANUM(char c) { return uprv_isASCIILetter(c) || UPRV_ISDIGIT(c); } 517 /* Punctuation/symbols allowed in legacy key values */ 518 inline bool UPRV_OK_VALUE_PUNCTUATION(char c) { return c == '_' || c == '-' || c == '+' || c == '/'; } 519 520 } // namespace 521 522 #define ULOC_KEYWORD_BUFFER_LEN 25 523 #define ULOC_MAX_NO_KEYWORDS 25 524 525 U_CAPI const char * U_EXPORT2 526 locale_getKeywordsStart(std::string_view localeID) { 527 if (size_t pos = localeID.find('@'); pos != std::string_view::npos) { 528 return localeID.data() + pos; 529 } 530 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) 531 else { 532 /* We do this because the @ sign is variant, and the @ sign used on one 533 EBCDIC machine won't be compiled the same way on other EBCDIC based 534 machines. 
*/ 535 static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 }; 536 const uint8_t *charToFind = ebcdicSigns; 537 while(*charToFind) { 538 if (size_t pos = localeID.find(*charToFind); pos != std::string_view::npos) { 539 return localeID.data() + pos; 540 } 541 charToFind++; 542 } 543 } 544 #endif 545 return nullptr; 546 } 547 548 namespace { 549 550 /** 551 * @param keywordName incoming name to be canonicalized 552 * @param status return status (keyword too long) 553 * @return the keyword name 554 */ 555 CharString locale_canonKeywordName(std::string_view keywordName, UErrorCode& status) 556 { 557 if (U_FAILURE(status)) { return {}; } 558 CharString result; 559 560 for (char c : keywordName) { 561 if (!UPRV_ISALPHANUM(c)) { 562 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */ 563 return {}; 564 } 565 result.append(uprv_tolower(c), status); 566 } 567 if (result.isEmpty()) { 568 status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name */ 569 return {}; 570 } 571 572 return result; 573 } 574 575 typedef struct { 576 char keyword[ULOC_KEYWORD_BUFFER_LEN]; 577 int32_t keywordLen; 578 const char *valueStart; 579 int32_t valueLen; 580 } KeywordStruct; 581 582 int32_t U_CALLCONV 583 compareKeywordStructs(const void * /*context*/, const void *left, const void *right) { 584 const char* leftString = static_cast<const KeywordStruct*>(left)->keyword; 585 const char* rightString = static_cast<const KeywordStruct*>(right)->keyword; 586 return uprv_strcmp(leftString, rightString); 587 } 588 589 } // namespace 590 591 U_EXPORT CharString 592 ulocimp_getKeywords(std::string_view localeID, 593 char prev, 594 bool valuesToo, 595 UErrorCode& status) 596 { 597 return ByteSinkUtil::viaByteSinkToCharString( 598 [&](ByteSink& sink, UErrorCode& status) { 599 ulocimp_getKeywords(localeID, 600 prev, 601 sink, 602 valuesToo, 603 status); 604 }, 605 status); 606 } 607 608 U_EXPORT void 609 ulocimp_getKeywords(std::string_view 
localeID, 610 char prev, 611 ByteSink& sink, 612 bool valuesToo, 613 UErrorCode& status) 614 { 615 if (U_FAILURE(status)) { return; } 616 617 KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS]; 618 619 int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS; 620 int32_t numKeywords = 0; 621 size_t equalSign = std::string_view::npos; 622 size_t semicolon = std::string_view::npos; 623 int32_t i = 0, j, n; 624 625 if(prev == '@') { /* start of keyword definition */ 626 /* we will grab pairs, trim spaces, lowercase keywords, sort and return */ 627 do { 628 bool duplicate = false; 629 /* skip leading spaces */ 630 while (!localeID.empty() && localeID.front() == ' ') { 631 localeID.remove_prefix(1); 632 } 633 if (localeID.empty()) { /* handle trailing "; " */ 634 break; 635 } 636 if(numKeywords == maxKeywords) { 637 status = U_INTERNAL_PROGRAM_ERROR; 638 return; 639 } 640 equalSign = localeID.find('='); 641 semicolon = localeID.find(';'); 642 /* lack of '=' [foo@currency] is illegal */ 643 /* ';' before '=' [foo@currency;collation=pinyin] is illegal */ 644 if (equalSign == std::string_view::npos || 645 (semicolon != std::string_view::npos && semicolon < equalSign)) { 646 status = U_INVALID_FORMAT_ERROR; 647 return; 648 } 649 /* zero-length keyword is an error. */ 650 if (equalSign == 0) { 651 status = U_INVALID_FORMAT_ERROR; 652 return; 653 } 654 /* need to normalize both keyword and keyword name */ 655 if (equalSign >= ULOC_KEYWORD_BUFFER_LEN) { 656 /* keyword name too long for internal buffer */ 657 status = U_INTERNAL_PROGRAM_ERROR; 658 return; 659 } 660 for (i = 0, n = 0; static_cast<size_t>(i) < equalSign; ++i) { 661 if (localeID[i] != ' ') { 662 keywordList[numKeywords].keyword[n++] = uprv_tolower(localeID[i]); 663 } 664 } 665 666 keywordList[numKeywords].keyword[n] = 0; 667 keywordList[numKeywords].keywordLen = n; 668 /* now grab the value part. 
First we skip the '=' */ 669 equalSign++; 670 /* then we leading spaces */ 671 while (equalSign < localeID.length() && localeID[equalSign] == ' ') { 672 equalSign++; 673 } 674 675 /* Premature end or zero-length value */ 676 if (equalSign == localeID.length() || equalSign == semicolon) { 677 status = U_INVALID_FORMAT_ERROR; 678 return; 679 } 680 681 keywordList[numKeywords].valueStart = localeID.data() + equalSign; 682 683 std::string_view value = localeID; 684 if (semicolon != std::string_view::npos) { 685 value.remove_suffix(value.length() - semicolon); 686 localeID.remove_prefix(semicolon + 1); 687 } else { 688 localeID = {}; 689 } 690 value.remove_prefix(equalSign); 691 if (size_t last = value.find_last_not_of(' '); last != std::string_view::npos) { 692 value.remove_suffix(value.length() - last - 1); 693 } 694 keywordList[numKeywords].valueLen = static_cast<int32_t>(value.length()); 695 696 /* If this is a duplicate keyword, then ignore it */ 697 for (j=0; j<numKeywords; ++j) { 698 if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) { 699 duplicate = true; 700 break; 701 } 702 } 703 if (!duplicate) { 704 ++numKeywords; 705 } 706 } while (!localeID.empty()); 707 708 /* now we have a list of keywords */ 709 /* we need to sort it */ 710 uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, nullptr, false, &status); 711 712 /* Now construct the keyword part */ 713 for(i = 0; i < numKeywords; i++) { 714 sink.Append(keywordList[i].keyword, keywordList[i].keywordLen); 715 if(valuesToo) { 716 sink.Append("=", 1); 717 sink.Append(keywordList[i].valueStart, keywordList[i].valueLen); 718 if(i < numKeywords - 1) { 719 sink.Append(";", 1); 720 } 721 } else { 722 sink.Append("\0", 1); 723 } 724 } 725 } 726 } 727 728 U_CAPI int32_t U_EXPORT2 729 uloc_getKeywordValue(const char* localeID, 730 const char* keywordName, 731 char* buffer, int32_t bufferCapacity, 732 UErrorCode* status) 733 { 734 if (U_FAILURE(*status)) { 
return 0; } 735 if (keywordName == nullptr || *keywordName == '\0') { 736 *status = U_ILLEGAL_ARGUMENT_ERROR; 737 return 0; 738 } 739 return ByteSinkUtil::viaByteSinkToTerminatedChars( 740 buffer, bufferCapacity, 741 [&](ByteSink& sink, UErrorCode& status) { 742 ulocimp_getKeywordValue(localeID, keywordName, sink, status); 743 }, 744 *status); 745 } 746 747 U_EXPORT CharString 748 ulocimp_getKeywordValue(const char* localeID, 749 std::string_view keywordName, 750 UErrorCode& status) 751 { 752 return ByteSinkUtil::viaByteSinkToCharString( 753 [&](ByteSink& sink, UErrorCode& status) { 754 ulocimp_getKeywordValue(localeID, keywordName, sink, status); 755 }, 756 status); 757 } 758 759 U_EXPORT void 760 ulocimp_getKeywordValue(const char* localeID, 761 std::string_view keywordName, 762 icu::ByteSink& sink, 763 UErrorCode& status) 764 { 765 if (U_FAILURE(status)) { return; } 766 767 if (localeID == nullptr || keywordName.empty()) { 768 status = U_ILLEGAL_ARGUMENT_ERROR; 769 return; 770 } 771 772 const char* startSearchHere = nullptr; 773 const char* nextSeparator = nullptr; 774 775 CharString tempBuffer; 776 const char* tmpLocaleID; 777 778 CharString canonKeywordName = locale_canonKeywordName(keywordName, status); 779 if (U_FAILURE(status)) { 780 return; 781 } 782 783 if (localeID != nullptr && _hasBCP47Extension(localeID)) { 784 tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, status); 785 tmpLocaleID = U_SUCCESS(status) && !tempBuffer.isEmpty() ? 
tempBuffer.data() : localeID; 786 } else { 787 tmpLocaleID=localeID; 788 } 789 790 startSearchHere = locale_getKeywordsStart(tmpLocaleID); 791 if(startSearchHere == nullptr) { 792 /* no keywords, return at once */ 793 return; 794 } 795 796 /* find the first keyword */ 797 while(startSearchHere) { 798 const char* keyValueTail; 799 800 startSearchHere++; /* skip @ or ; */ 801 nextSeparator = uprv_strchr(startSearchHere, '='); 802 if(!nextSeparator) { 803 status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */ 804 return; 805 } 806 /* strip leading & trailing spaces (TC decided to tolerate these) */ 807 while(*startSearchHere == ' ') { 808 startSearchHere++; 809 } 810 keyValueTail = nextSeparator; 811 while (keyValueTail > startSearchHere && *(keyValueTail-1) == ' ') { 812 keyValueTail--; 813 } 814 /* now keyValueTail points to first char after the keyName */ 815 /* copy & normalize keyName from locale */ 816 if (startSearchHere == keyValueTail) { 817 status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */ 818 return; 819 } 820 CharString localeKeywordName; 821 while (startSearchHere < keyValueTail) { 822 if (!UPRV_ISALPHANUM(*startSearchHere)) { 823 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */ 824 return; 825 } 826 localeKeywordName.append(uprv_tolower(*startSearchHere++), status); 827 } 828 if (U_FAILURE(status)) { 829 return; 830 } 831 832 startSearchHere = uprv_strchr(nextSeparator, ';'); 833 834 if (canonKeywordName == localeKeywordName) { 835 /* current entry matches the keyword. */ 836 nextSeparator++; /* skip '=' */ 837 /* First strip leading & trailing spaces (TC decided to tolerate these) */ 838 while(*nextSeparator == ' ') { 839 nextSeparator++; 840 } 841 keyValueTail = (startSearchHere)? 
startSearchHere: nextSeparator + uprv_strlen(nextSeparator); 842 while(keyValueTail > nextSeparator && *(keyValueTail-1) == ' ') { 843 keyValueTail--; 844 } 845 /* Now copy the value, but check well-formedness */ 846 if (nextSeparator == keyValueTail) { 847 status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value name in passed-in locale */ 848 return; 849 } 850 while (nextSeparator < keyValueTail) { 851 if (!UPRV_ISALPHANUM(*nextSeparator) && !UPRV_OK_VALUE_PUNCTUATION(*nextSeparator)) { 852 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */ 853 return; 854 } 855 /* Should we lowercase value to return here? Tests expect as-is. */ 856 sink.Append(nextSeparator++, 1); 857 } 858 return; 859 } 860 } 861 } 862 863 U_CAPI int32_t U_EXPORT2 864 uloc_setKeywordValue(const char* keywordName, 865 const char* keywordValue, 866 char* buffer, int32_t bufferCapacity, 867 UErrorCode* status) 868 { 869 if (U_FAILURE(*status)) { return 0; } 870 871 if (keywordName == nullptr || *keywordName == 0) { 872 *status = U_ILLEGAL_ARGUMENT_ERROR; 873 return 0; 874 } 875 876 if (bufferCapacity <= 1) { 877 *status = U_ILLEGAL_ARGUMENT_ERROR; 878 return 0; 879 } 880 881 int32_t bufLen = (int32_t)uprv_strlen(buffer); 882 if(bufferCapacity<bufLen) { 883 /* The capacity is less than the length?! Is this NUL terminated? */ 884 *status = U_ILLEGAL_ARGUMENT_ERROR; 885 return 0; 886 } 887 888 char* keywords = const_cast<char*>( 889 locale_getKeywordsStart({buffer, static_cast<std::string_view::size_type>(bufLen)})); 890 int32_t baseLen = keywords == nullptr ? bufLen : keywords - buffer; 891 // Remove -1 from the capacity so that this function can guarantee NUL termination. 892 CheckedArrayByteSink sink(keywords == nullptr ? buffer + bufLen : keywords, 893 bufferCapacity - baseLen - 1); 894 int32_t reslen = ulocimp_setKeywordValue( 895 keywords == nullptr ? std::string_view() : keywords, 896 keywordName, 897 keywordValue == nullptr ? 
std::string_view() : keywordValue, 898 sink, 899 *status); 900 901 if (U_FAILURE(*status)) { 902 return *status == U_BUFFER_OVERFLOW_ERROR ? reslen + baseLen : 0; 903 } 904 905 // See the documentation for this function, it's guaranteed to never 906 // overflow the buffer but instead abort with BUFFER_OVERFLOW_ERROR. 907 // In this case, nothing has been written to the sink, so it cannot have Overflowed(). 908 U_ASSERT(!sink.Overflowed()); 909 U_ASSERT(reslen >= 0); 910 return u_terminateChars(buffer, bufferCapacity, reslen + baseLen, status); 911 } 912 913 U_EXPORT void 914 ulocimp_setKeywordValue(std::string_view keywordName, 915 std::string_view keywordValue, 916 CharString& localeID, 917 UErrorCode& status) 918 { 919 if (U_FAILURE(status)) { return; } 920 std::string_view keywords; 921 if (const char* start = locale_getKeywordsStart(localeID.toStringPiece()); start != nullptr) { 922 // This is safe because CharString::truncate() doesn't actually erase any 923 // data, but simply sets the position for where new data will be written. 924 int32_t size = start - localeID.data(); 925 keywords = localeID.toStringPiece(); 926 keywords.remove_prefix(size); 927 localeID.truncate(size); 928 } 929 CharStringByteSink sink(&localeID); 930 ulocimp_setKeywordValue(keywords, keywordName, keywordValue, sink, status); 931 } 932 933 U_EXPORT int32_t 934 ulocimp_setKeywordValue(std::string_view keywords, 935 std::string_view keywordName, 936 std::string_view keywordValue, 937 ByteSink& sink, 938 UErrorCode& status) 939 { 940 if (U_FAILURE(status)) { return 0; } 941 942 /* TODO: sorting. removal. 
*/ 943 int32_t needLen = 0; 944 int32_t rc; 945 CharString updatedKeysAndValues; 946 bool handledInputKeyAndValue = false; 947 char keyValuePrefix = '@'; 948 949 if (status == U_STRING_NOT_TERMINATED_WARNING) { 950 status = U_ZERO_ERROR; 951 } 952 if (keywordName.empty()) { 953 status = U_ILLEGAL_ARGUMENT_ERROR; 954 return 0; 955 } 956 CharString canonKeywordName = locale_canonKeywordName(keywordName, status); 957 if (U_FAILURE(status)) { 958 return 0; 959 } 960 961 CharString canonKeywordValue; 962 for (char c : keywordValue) { 963 if (!UPRV_ISALPHANUM(c) && !UPRV_OK_VALUE_PUNCTUATION(c)) { 964 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed key value */ 965 return 0; 966 } 967 /* Should we force lowercase in value to set? */ 968 canonKeywordValue.append(c, status); 969 } 970 if (U_FAILURE(status)) { 971 return 0; 972 } 973 974 if (keywords.size() <= 1) { 975 if (canonKeywordValue.isEmpty()) { /* no keywords = nothing to remove */ 976 U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING); 977 return 0; 978 } 979 980 needLen = 1 + canonKeywordName.length() + 1 + canonKeywordValue.length(); 981 int32_t capacity = 0; 982 char* buffer = sink.GetAppendBuffer( 983 needLen, needLen, nullptr, needLen, &capacity); 984 if (capacity < needLen || buffer == nullptr) { 985 status = U_BUFFER_OVERFLOW_ERROR; 986 return needLen; /* no change */ 987 } 988 char* it = buffer; 989 990 *it++ = '@'; 991 uprv_memcpy(it, canonKeywordName.data(), canonKeywordName.length()); 992 it += canonKeywordName.length(); 993 *it++ = '='; 994 uprv_memcpy(it, canonKeywordValue.data(), canonKeywordValue.length()); 995 sink.Append(buffer, needLen); 996 U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING); 997 return needLen; 998 } /* end shortcut - no @ */ 999 1000 /* search for keyword */ 1001 for (size_t keywordStart = 0; keywordStart != std::string_view::npos;) { 1002 keywordStart++; /* skip @ or ; */ 1003 size_t nextEqualsign = keywords.find('=', keywordStart); 1004 if (nextEqualsign == 
std::string_view::npos) { 1005 status = U_ILLEGAL_ARGUMENT_ERROR; /* key must have =value */ 1006 return 0; 1007 } 1008 /* strip leading & trailing spaces (TC decided to tolerate these) */ 1009 while (keywordStart < keywords.size() && keywords[keywordStart] == ' ') { 1010 keywordStart++; 1011 } 1012 size_t keyValueTail = nextEqualsign; 1013 while (keyValueTail > keywordStart && keywords[keyValueTail - 1] == ' ') { 1014 keyValueTail--; 1015 } 1016 /* now keyValueTail points to first char after the keyName */ 1017 /* copy & normalize keyName from locale */ 1018 if (keywordStart == keyValueTail) { 1019 status = U_ILLEGAL_ARGUMENT_ERROR; /* empty keyword name in passed-in locale */ 1020 return 0; 1021 } 1022 CharString localeKeywordName; 1023 while (keywordStart < keyValueTail) { 1024 if (!UPRV_ISALPHANUM(keywords[keywordStart])) { 1025 status = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */ 1026 return 0; 1027 } 1028 localeKeywordName.append(uprv_tolower(keywords[keywordStart++]), status); 1029 } 1030 if (U_FAILURE(status)) { 1031 return 0; 1032 } 1033 1034 size_t nextSeparator = keywords.find(';', nextEqualsign); 1035 1036 /* start processing the value part */ 1037 nextEqualsign++; /* skip '=' */ 1038 /* First strip leading & trailing spaces (TC decided to tolerate these) */ 1039 while (nextEqualsign < keywords.size() && keywords[nextEqualsign] == ' ') { 1040 nextEqualsign++; 1041 } 1042 keyValueTail = nextSeparator == std::string_view::npos ? keywords.size() : nextSeparator; 1043 while (keyValueTail > nextEqualsign && keywords[keyValueTail - 1] == ' ') { 1044 keyValueTail--; 1045 } 1046 if (nextEqualsign == keyValueTail) { 1047 status = U_ILLEGAL_ARGUMENT_ERROR; /* empty key value in passed-in locale */ 1048 return 0; 1049 } 1050 1051 rc = uprv_strcmp(canonKeywordName.data(), localeKeywordName.data()); 1052 if(rc == 0) { 1053 /* Current entry matches the input keyword. 
Update the entry */ 1054 if (!canonKeywordValue.isEmpty()) { /* updating a value */ 1055 updatedKeysAndValues.append(keyValuePrefix, status); 1056 keyValuePrefix = ';'; /* for any subsequent key-value pair */ 1057 updatedKeysAndValues.append(canonKeywordName, status); 1058 updatedKeysAndValues.append('=', status); 1059 updatedKeysAndValues.append(canonKeywordValue, status); 1060 } /* else removing this entry, don't emit anything */ 1061 handledInputKeyAndValue = true; 1062 } else { 1063 /* input keyword sorts earlier than current entry, add before current entry */ 1064 if (rc < 0 && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) { 1065 /* insert new entry at this location */ 1066 updatedKeysAndValues.append(keyValuePrefix, status); 1067 keyValuePrefix = ';'; /* for any subsequent key-value pair */ 1068 updatedKeysAndValues.append(canonKeywordName, status); 1069 updatedKeysAndValues.append('=', status); 1070 updatedKeysAndValues.append(canonKeywordValue, status); 1071 handledInputKeyAndValue = true; 1072 } 1073 /* copy the current entry */ 1074 updatedKeysAndValues.append(keyValuePrefix, status); 1075 keyValuePrefix = ';'; /* for any subsequent key-value pair */ 1076 updatedKeysAndValues.append(localeKeywordName, status); 1077 updatedKeysAndValues.append('=', status); 1078 updatedKeysAndValues.append(keywords.data() + nextEqualsign, 1079 static_cast<int32_t>(keyValueTail - nextEqualsign), status); 1080 } 1081 if (nextSeparator == std::string_view::npos && !canonKeywordValue.isEmpty() && !handledInputKeyAndValue) { 1082 /* append new entry at the end, it sorts later than existing entries */ 1083 updatedKeysAndValues.append(keyValuePrefix, status); 1084 /* skip keyValuePrefix update, no subsequent key-value pair */ 1085 updatedKeysAndValues.append(canonKeywordName, status); 1086 updatedKeysAndValues.append('=', status); 1087 updatedKeysAndValues.append(canonKeywordValue, status); 1088 handledInputKeyAndValue = true; 1089 } 1090 keywordStart = nextSeparator; 
1091 } /* end loop searching */ 1092 1093 /* Any error from updatedKeysAndValues.append above would be internal and not due to 1094 * problems with the passed-in locale. So if we did encounter problems with the 1095 * passed-in locale above, those errors took precedence and overrode any error 1096 * status from updatedKeysAndValues.append, and also caused a return of 0. If there 1097 * are errors here they are from updatedKeysAndValues.append; they do cause an 1098 * error return but the passed-in locale is unmodified and the original bufLen is 1099 * returned. 1100 */ 1101 if (!handledInputKeyAndValue || U_FAILURE(status)) { 1102 /* if input key/value specified removal of a keyword not present in locale, or 1103 * there was an error in CharString.append, leave original locale alone. */ 1104 U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING); 1105 // The sink is expected to be a buffer which already contains the full 1106 // locale string, so when it isn't going to be modified there's no need 1107 // to actually write any data to it, as the data is already there. Only 1108 // the first character needs to be overwritten (changing '\0' to '@'). 1109 needLen = static_cast<int32_t>(keywords.size()); 1110 int32_t capacity = 0; 1111 char* buffer = sink.GetAppendBuffer( 1112 needLen, needLen, nullptr, needLen, &capacity); 1113 if (capacity < needLen || buffer == nullptr) { 1114 status = U_BUFFER_OVERFLOW_ERROR; 1115 } else { 1116 *buffer = '@'; 1117 sink.Append(buffer, needLen); 1118 } 1119 return needLen; 1120 } 1121 1122 needLen = updatedKeysAndValues.length(); 1123 // Check to see can we fit the updatedKeysAndValues, if not, return 1124 // U_BUFFER_OVERFLOW_ERROR without copy updatedKeysAndValues into it. 1125 // We do this because this API function does not behave like most others: 1126 // It promises never to set a U_STRING_NOT_TERMINATED_WARNING. 
1127 // When the contents fits but without the terminating NUL, in this case we need to not change 1128 // the buffer contents and return with a buffer overflow error. 1129 if (needLen > 0) { 1130 int32_t capacity = 0; 1131 char* buffer = sink.GetAppendBuffer( 1132 needLen, needLen, nullptr, needLen, &capacity); 1133 if (capacity < needLen || buffer == nullptr) { 1134 status = U_BUFFER_OVERFLOW_ERROR; 1135 return needLen; 1136 } 1137 uprv_memcpy(buffer, updatedKeysAndValues.data(), needLen); 1138 sink.Append(buffer, needLen); 1139 } 1140 U_ASSERT(status != U_STRING_NOT_TERMINATED_WARNING); 1141 return needLen; 1142 } 1143 1144 /* ### ID parsing implementation **************************************************/ 1145 1146 namespace { 1147 1148 inline bool _isPrefixLetter(char a) { return a == 'x' || a == 'X' || a == 'i' || a == 'I'; } 1149 1150 /*returns true if one of the special prefixes is here (s=string) 1151 'x-' or 'i-' */ 1152 inline bool _isIDPrefix(std::string_view s) { 1153 return s.size() >= 2 && _isPrefixLetter(s[0]) && _isIDSeparator(s[1]); 1154 } 1155 1156 /* Dot terminates it because of POSIX form where dot precedes the codepage 1157 * except for variant 1158 */ 1159 inline bool _isTerminator(char a) { return a == '.' || a == '@'; } 1160 1161 inline bool _isBCP47Extension(std::string_view p) { 1162 return p.size() >= 3 && 1163 p[0] == '-' && 1164 (p[1] == 't' || p[1] == 'T' || 1165 p[1] == 'u' || p[1] == 'U' || 1166 p[1] == 'x' || p[1] == 'X') && 1167 p[2] == '-'; 1168 } 1169 1170 /** 1171 * Lookup 'key' in the array 'list'. The array 'list' should contain 1172 * a nullptr entry, followed by more entries, and a second nullptr entry. 1173 * 1174 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or 1175 * COUNTRIES_3. 
1176 */ 1177 std::optional<int16_t> _findIndex(const char* const* list, const char* key) 1178 { 1179 const char* const* anchor = list; 1180 int32_t pass = 0; 1181 1182 /* Make two passes through two nullptr-terminated arrays at 'list' */ 1183 while (pass++ < 2) { 1184 while (*list) { 1185 if (uprv_strcmp(key, *list) == 0) { 1186 return static_cast<int16_t>(list - anchor); 1187 } 1188 list++; 1189 } 1190 ++list; /* skip final nullptr *CWB*/ 1191 } 1192 return std::nullopt; 1193 } 1194 1195 } // namespace 1196 1197 U_CFUNC const char* 1198 uloc_getCurrentCountryID(const char* oldID){ 1199 std::optional<int16_t> offset = _findIndex(DEPRECATED_COUNTRIES, oldID); 1200 return offset.has_value() ? REPLACEMENT_COUNTRIES[*offset] : oldID; 1201 } 1202 U_CFUNC const char* 1203 uloc_getCurrentLanguageID(const char* oldID){ 1204 std::optional<int16_t> offset = _findIndex(DEPRECATED_LANGUAGES, oldID); 1205 return offset.has_value() ? REPLACEMENT_LANGUAGES[*offset] : oldID; 1206 } 1207 1208 namespace { 1209 1210 /* 1211 * the internal functions _getLanguage(), _getScript(), _getRegion(), _getVariant() 1212 * avoid duplicating code to handle the earlier locale ID pieces 1213 * in the functions for the later ones by 1214 * setting the *pEnd pointer to where they stopped parsing 1215 * 1216 * TODO try to use this in Locale 1217 */ 1218 1219 size_t _getLanguage(std::string_view localeID, ByteSink* sink, UErrorCode& status) { 1220 size_t skip = 0; 1221 if (localeID.size() == 4 && uprv_strnicmp(localeID.data(), "root", 4) == 0) { 1222 skip = 4; 1223 localeID.remove_prefix(skip); 1224 } else if (localeID.size() >= 3 && uprv_strnicmp(localeID.data(), "und", 3) == 0 && 1225 (localeID.size() == 3 || 1226 localeID[3] == '-' || 1227 localeID[3] == '_' || 1228 localeID[3] == '@')) { 1229 skip = 3; 1230 localeID.remove_prefix(skip); 1231 } 1232 1233 constexpr int32_t MAXLEN = ULOC_LANG_CAPACITY - 1; // Minus NUL. 
1234 1235 /* if it starts with i- or x- then copy that prefix */ 1236 size_t len = _isIDPrefix(localeID) ? 2 : 0; 1237 while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) { 1238 if (len == MAXLEN) { 1239 status = U_ILLEGAL_ARGUMENT_ERROR; 1240 return 0; 1241 } 1242 len++; 1243 } 1244 1245 if (sink == nullptr || len == 0) { return skip + len; } 1246 1247 int32_t minCapacity = uprv_max(static_cast<int32_t>(len), 4); // Minimum 3 letters plus NUL. 1248 char scratch[MAXLEN]; 1249 int32_t capacity = 0; 1250 char* buffer = sink->GetAppendBuffer( 1251 minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity); 1252 1253 for (size_t i = 0; i < len; ++i) { 1254 buffer[i] = uprv_tolower(localeID[i]); 1255 } 1256 if (localeID.size() >= 2 && _isIDSeparator(localeID[1])) { 1257 buffer[1] = '-'; 1258 } 1259 1260 if (len == 3) { 1261 /* convert 3 character code to 2 character code if possible *CWB*/ 1262 U_ASSERT(capacity >= 4); 1263 buffer[3] = '\0'; 1264 std::optional<int16_t> offset = _findIndex(LANGUAGES_3, buffer); 1265 if (offset.has_value()) { 1266 const char* const alias = LANGUAGES[*offset]; 1267 sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias))); 1268 return skip + len; 1269 } 1270 } 1271 1272 sink->Append(buffer, static_cast<int32_t>(len)); 1273 return skip + len; 1274 } 1275 1276 size_t _getScript(std::string_view localeID, ByteSink* sink) { 1277 constexpr int32_t LENGTH = 4; 1278 1279 size_t len = 0; 1280 while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len]) && 1281 uprv_isASCIILetter(localeID[len])) { 1282 if (len == LENGTH) { return 0; } 1283 len++; 1284 } 1285 if (len != LENGTH) { return 0; } 1286 1287 if (sink == nullptr) { return len; } 1288 1289 char scratch[LENGTH]; 1290 int32_t capacity = 0; 1291 char* buffer = sink->GetAppendBuffer( 1292 LENGTH, LENGTH, scratch, UPRV_LENGTHOF(scratch), &capacity); 1293 1294 buffer[0] = uprv_toupper(localeID[0]); 1295 
for (int32_t i = 1; i < LENGTH; ++i) { 1296 buffer[i] = uprv_tolower(localeID[i]); 1297 } 1298 1299 sink->Append(buffer, LENGTH); 1300 return len; 1301 } 1302 1303 size_t _getRegion(std::string_view localeID, ByteSink* sink) { 1304 constexpr int32_t MINLEN = 2; 1305 constexpr int32_t MAXLEN = ULOC_COUNTRY_CAPACITY - 1; // Minus NUL. 1306 1307 size_t len = 0; 1308 while (len < localeID.size() && !_isTerminator(localeID[len]) && !_isIDSeparator(localeID[len])) { 1309 if (len == MAXLEN) { return 0; } 1310 len++; 1311 } 1312 if (len < MINLEN) { return 0; } 1313 1314 if (sink == nullptr) { return len; } 1315 1316 char scratch[ULOC_COUNTRY_CAPACITY]; 1317 int32_t capacity = 0; 1318 char* buffer = sink->GetAppendBuffer( 1319 ULOC_COUNTRY_CAPACITY, 1320 ULOC_COUNTRY_CAPACITY, 1321 scratch, 1322 UPRV_LENGTHOF(scratch), 1323 &capacity); 1324 1325 for (size_t i = 0; i < len; ++i) { 1326 buffer[i] = uprv_toupper(localeID[i]); 1327 } 1328 1329 if (len == 3) { 1330 /* convert 3 character code to 2 character code if possible *CWB*/ 1331 U_ASSERT(capacity >= 4); 1332 buffer[3] = '\0'; 1333 std::optional<int16_t> offset = _findIndex(COUNTRIES_3, buffer); 1334 if (offset.has_value()) { 1335 const char* const alias = COUNTRIES[*offset]; 1336 sink->Append(alias, static_cast<int32_t>(uprv_strlen(alias))); 1337 return len; 1338 } 1339 } 1340 1341 sink->Append(buffer, static_cast<int32_t>(len)); 1342 return len; 1343 } 1344 1345 /** 1346 * @param needSeparator if true, then add leading '_' if any variants 1347 * are added to 'variant' 1348 */ 1349 size_t 1350 _getVariant(std::string_view localeID, 1351 char prev, 1352 ByteSink* sink, 1353 bool needSeparator, 1354 UErrorCode& status) { 1355 if (U_FAILURE(status) || localeID.empty()) return 0; 1356 1357 // Reasonable upper limit for variants 1358 // There are no strict limitation of the syntax of variant in the legacy 1359 // locale format. 
If the locale is constructed from unicode_locale_id 1360 // as defined in UTS35, then we know each unicode_variant_subtag 1361 // could have max length of 8 ((alphanum{5,8} | digit alphanum{3}) 1362 // 179 would allow 20 unicode_variant_subtag with sep in the 1363 // unicode_locale_id 1364 // 8*20 + 1*(20-1) = 179 1365 constexpr int32_t MAX_VARIANTS_LENGTH = 179; 1366 1367 /* get one or more variant tags and separate them with '_' */ 1368 size_t index = 0; 1369 if (_isIDSeparator(prev)) { 1370 /* get a variant string after a '-' or '_' */ 1371 for (std::string_view sub = localeID;;) { 1372 size_t next = sub.find_first_of(".@_-"); 1373 // For historical reasons, a trailing separator is included in the variant. 1374 bool finished = next == std::string_view::npos || next + 1 == sub.length(); 1375 size_t limit = finished ? sub.length() : next; 1376 index += limit; 1377 if (index > MAX_VARIANTS_LENGTH) { 1378 status = U_ILLEGAL_ARGUMENT_ERROR; 1379 return 0; 1380 } 1381 1382 if (sink != nullptr) { 1383 if (needSeparator) { 1384 sink->Append("_", 1); 1385 } else { 1386 needSeparator = true; 1387 } 1388 1389 int32_t length = static_cast<int32_t>(limit); 1390 int32_t minCapacity = uprv_min(length, MAX_VARIANTS_LENGTH); 1391 char scratch[MAX_VARIANTS_LENGTH]; 1392 int32_t capacity = 0; 1393 char* buffer = sink->GetAppendBuffer( 1394 minCapacity, minCapacity, scratch, UPRV_LENGTHOF(scratch), &capacity); 1395 1396 for (size_t i = 0; i < limit; ++i) { 1397 buffer[i] = uprv_toupper(sub[i]); 1398 } 1399 sink->Append(buffer, length); 1400 } 1401 1402 if (finished) { return index; } 1403 sub.remove_prefix(next); 1404 if (_isTerminator(sub.front()) || _isBCP47Extension(sub)) { return index; } 1405 sub.remove_prefix(1); 1406 index++; 1407 } 1408 } 1409 1410 size_t skip = 0; 1411 /* if there is no variant tag after a '-' or '_' then look for '@' */ 1412 if (prev == '@') { 1413 /* keep localeID */ 1414 } else if (const char* p = locale_getKeywordsStart(localeID); p != nullptr) { 1415 
skip = 1 + p - localeID.data(); /* point after the '@' */ 1416 localeID.remove_prefix(skip); 1417 } else { 1418 return 0; 1419 } 1420 for (; index < localeID.size() && !_isTerminator(localeID[index]); index++) { 1421 if (index >= MAX_VARIANTS_LENGTH) { // same as length > MAX_VARIANTS_LENGTH 1422 status = U_ILLEGAL_ARGUMENT_ERROR; 1423 return 0; 1424 } 1425 if (needSeparator) { 1426 if (sink != nullptr) { 1427 sink->Append("_", 1); 1428 } 1429 needSeparator = false; 1430 } 1431 if (sink != nullptr) { 1432 char c = uprv_toupper(localeID[index]); 1433 if (c == '-' || c == ',') c = '_'; 1434 sink->Append(&c, 1); 1435 } 1436 } 1437 return skip + index; 1438 } 1439 1440 } // namespace 1441 1442 U_EXPORT CharString 1443 ulocimp_getLanguage(std::string_view localeID, UErrorCode& status) { 1444 return ByteSinkUtil::viaByteSinkToCharString( 1445 [&](ByteSink& sink, UErrorCode& status) { 1446 ulocimp_getSubtags( 1447 localeID, 1448 &sink, 1449 nullptr, 1450 nullptr, 1451 nullptr, 1452 nullptr, 1453 status); 1454 }, 1455 status); 1456 } 1457 1458 U_EXPORT CharString 1459 ulocimp_getScript(std::string_view localeID, UErrorCode& status) { 1460 return ByteSinkUtil::viaByteSinkToCharString( 1461 [&](ByteSink& sink, UErrorCode& status) { 1462 ulocimp_getSubtags( 1463 localeID, 1464 nullptr, 1465 &sink, 1466 nullptr, 1467 nullptr, 1468 nullptr, 1469 status); 1470 }, 1471 status); 1472 } 1473 1474 U_EXPORT CharString 1475 ulocimp_getRegion(std::string_view localeID, UErrorCode& status) { 1476 return ByteSinkUtil::viaByteSinkToCharString( 1477 [&](ByteSink& sink, UErrorCode& status) { 1478 ulocimp_getSubtags( 1479 localeID, 1480 nullptr, 1481 nullptr, 1482 &sink, 1483 nullptr, 1484 nullptr, 1485 status); 1486 }, 1487 status); 1488 } 1489 1490 U_EXPORT CharString 1491 ulocimp_getVariant(std::string_view localeID, UErrorCode& status) { 1492 return ByteSinkUtil::viaByteSinkToCharString( 1493 [&](ByteSink& sink, UErrorCode& status) { 1494 ulocimp_getSubtags( 1495 localeID, 1496 nullptr, 
1497 nullptr, 1498 nullptr, 1499 &sink, 1500 nullptr, 1501 status); 1502 }, 1503 status); 1504 } 1505 1506 U_EXPORT void 1507 ulocimp_getSubtags( 1508 std::string_view localeID, 1509 CharString* language, 1510 CharString* script, 1511 CharString* region, 1512 CharString* variant, 1513 const char** pEnd, 1514 UErrorCode& status) { 1515 if (U_FAILURE(status)) { return; } 1516 1517 std::optional<CharStringByteSink> languageSink; 1518 std::optional<CharStringByteSink> scriptSink; 1519 std::optional<CharStringByteSink> regionSink; 1520 std::optional<CharStringByteSink> variantSink; 1521 1522 if (language != nullptr) { languageSink.emplace(language); } 1523 if (script != nullptr) { scriptSink.emplace(script); } 1524 if (region != nullptr) { regionSink.emplace(region); } 1525 if (variant != nullptr) { variantSink.emplace(variant); } 1526 1527 ulocimp_getSubtags( 1528 localeID, 1529 languageSink.has_value() ? &*languageSink : nullptr, 1530 scriptSink.has_value() ? &*scriptSink : nullptr, 1531 regionSink.has_value() ? &*regionSink : nullptr, 1532 variantSink.has_value() ? 
&*variantSink : nullptr, 1533 pEnd, 1534 status); 1535 } 1536 1537 U_EXPORT void 1538 ulocimp_getSubtags( 1539 std::string_view localeID, 1540 ByteSink* language, 1541 ByteSink* script, 1542 ByteSink* region, 1543 ByteSink* variant, 1544 const char** pEnd, 1545 UErrorCode& status) { 1546 if (U_FAILURE(status)) { return; } 1547 1548 if (pEnd != nullptr) { 1549 *pEnd = localeID.data(); 1550 } else if (language == nullptr && 1551 script == nullptr && 1552 region == nullptr && 1553 variant == nullptr) { 1554 return; 1555 } 1556 1557 if (localeID.empty()) { return; } 1558 1559 bool hasRegion = false; 1560 1561 { 1562 size_t len = _getLanguage(localeID, language, status); 1563 if (U_FAILURE(status)) { return; } 1564 if (len > 0) { 1565 localeID.remove_prefix(len); 1566 } 1567 } 1568 1569 if (pEnd != nullptr) { 1570 *pEnd = localeID.data(); 1571 } else if (script == nullptr && 1572 region == nullptr && 1573 variant == nullptr) { 1574 return; 1575 } 1576 1577 if (localeID.empty()) { return; } 1578 1579 if (_isIDSeparator(localeID.front())) { 1580 std::string_view sub = localeID; 1581 sub.remove_prefix(1); 1582 size_t len = _getScript(sub, script); 1583 if (len > 0) { 1584 localeID.remove_prefix(len + 1); 1585 if (pEnd != nullptr) { *pEnd = localeID.data(); } 1586 } 1587 } 1588 1589 if ((region == nullptr && variant == nullptr && pEnd == nullptr) || localeID.empty()) { return; } 1590 1591 if (_isIDSeparator(localeID.front())) { 1592 std::string_view sub = localeID; 1593 sub.remove_prefix(1); 1594 size_t len = _getRegion(sub, region); 1595 if (len > 0) { 1596 hasRegion = true; 1597 localeID.remove_prefix(len + 1); 1598 if (pEnd != nullptr) { *pEnd = localeID.data(); } 1599 } 1600 } 1601 1602 if ((variant == nullptr && pEnd == nullptr) || localeID.empty()) { return; } 1603 1604 bool hasVariant = false; 1605 1606 if (_isIDSeparator(localeID.front()) && !_isBCP47Extension(localeID)) { 1607 std::string_view sub = localeID; 1608 /* If there was no country ID, skip a possible 
extra IDSeparator */ 1609 size_t skip = !hasRegion && localeID.size() > 1 && _isIDSeparator(localeID[1]) ? 2 : 1; 1610 sub.remove_prefix(skip); 1611 size_t len = _getVariant(sub, localeID[0], variant, false, status); 1612 if (U_FAILURE(status)) { return; } 1613 if (len > 0) { 1614 hasVariant = true; 1615 localeID.remove_prefix(skip + len); 1616 if (pEnd != nullptr) { *pEnd = localeID.data(); } 1617 } 1618 } 1619 1620 if ((variant == nullptr && pEnd == nullptr) || localeID.empty()) { return; } 1621 1622 if (_isBCP47Extension(localeID)) { 1623 localeID.remove_prefix(2); 1624 constexpr char vaposix[] = "-va-posix"; 1625 constexpr size_t length = sizeof vaposix - 1; 1626 for (size_t next;; localeID.remove_prefix(next)) { 1627 next = localeID.find('-', 1); 1628 if (next == std::string_view::npos) { break; } 1629 next = localeID.find('-', next + 1); 1630 bool finished = next == std::string_view::npos; 1631 std::string_view sub = localeID; 1632 if (!finished) { sub.remove_suffix(sub.length() - next); } 1633 1634 if (sub.length() == length && uprv_strnicmp(sub.data(), vaposix, length) == 0) { 1635 if (variant != nullptr) { 1636 if (hasVariant) { variant->Append("_", 1); } 1637 constexpr char posix[] = "POSIX"; 1638 variant->Append(posix, sizeof posix - 1); 1639 } 1640 if (pEnd != nullptr) { *pEnd = localeID.data() + length; } 1641 } 1642 1643 if (finished) { break; } 1644 } 1645 } 1646 } 1647 1648 /* Keyword enumeration */ 1649 1650 typedef struct UKeywordsContext { 1651 char* keywords; 1652 char* current; 1653 } UKeywordsContext; 1654 1655 U_CDECL_BEGIN 1656 1657 static void U_CALLCONV 1658 uloc_kw_closeKeywords(UEnumeration *enumerator) { 1659 uprv_free(((UKeywordsContext *)enumerator->context)->keywords); 1660 uprv_free(enumerator->context); 1661 uprv_free(enumerator); 1662 } 1663 1664 static int32_t U_CALLCONV 1665 uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) { 1666 char *kw = ((UKeywordsContext *)en->context)->keywords; 1667 int32_t result = 0; 
1668 while(*kw) { 1669 result++; 1670 kw += uprv_strlen(kw)+1; 1671 } 1672 return result; 1673 } 1674 1675 static const char * U_CALLCONV 1676 uloc_kw_nextKeyword(UEnumeration* en, 1677 int32_t* resultLength, 1678 UErrorCode* /*status*/) { 1679 const char* result = ((UKeywordsContext *)en->context)->current; 1680 int32_t len = 0; 1681 if(*result) { 1682 len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current); 1683 ((UKeywordsContext *)en->context)->current += len+1; 1684 } else { 1685 result = nullptr; 1686 } 1687 if (resultLength) { 1688 *resultLength = len; 1689 } 1690 return result; 1691 } 1692 1693 static void U_CALLCONV 1694 uloc_kw_resetKeywords(UEnumeration* en, 1695 UErrorCode* /*status*/) { 1696 ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords; 1697 } 1698 1699 U_CDECL_END 1700 1701 1702 static const UEnumeration gKeywordsEnum = { 1703 nullptr, 1704 nullptr, 1705 uloc_kw_closeKeywords, 1706 uloc_kw_countKeywords, 1707 uenum_unextDefault, 1708 uloc_kw_nextKeyword, 1709 uloc_kw_resetKeywords 1710 }; 1711 1712 U_CAPI UEnumeration* U_EXPORT2 1713 uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status) 1714 { 1715 if (U_FAILURE(*status)) { return nullptr; } 1716 1717 LocalMemory<UKeywordsContext> myContext; 1718 LocalMemory<UEnumeration> result; 1719 1720 myContext.adoptInstead(static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)))); 1721 result.adoptInstead(static_cast<UEnumeration *>(uprv_malloc(sizeof(UEnumeration)))); 1722 if (myContext.isNull() || result.isNull()) { 1723 *status = U_MEMORY_ALLOCATION_ERROR; 1724 return nullptr; 1725 } 1726 uprv_memcpy(result.getAlias(), &gKeywordsEnum, sizeof(UEnumeration)); 1727 myContext->keywords = static_cast<char *>(uprv_malloc(keywordListSize+1)); 1728 if (myContext->keywords == nullptr) { 1729 *status = U_MEMORY_ALLOCATION_ERROR; 1730 return nullptr; 1731 } 1732 uprv_memcpy(myContext->keywords, keywordList, 
keywordListSize); 1733 myContext->keywords[keywordListSize] = 0; 1734 myContext->current = myContext->keywords; 1735 result->context = myContext.orphan(); 1736 return result.orphan(); 1737 } 1738 1739 U_CAPI UEnumeration* U_EXPORT2 1740 uloc_openKeywords(const char* localeID, 1741 UErrorCode* status) 1742 { 1743 if(status==nullptr || U_FAILURE(*status)) { 1744 return nullptr; 1745 } 1746 1747 CharString tempBuffer; 1748 const char* tmpLocaleID; 1749 1750 if (localeID != nullptr && _hasBCP47Extension(localeID)) { 1751 tempBuffer = ulocimp_forLanguageTag(localeID, -1, nullptr, *status); 1752 tmpLocaleID = U_SUCCESS(*status) && !tempBuffer.isEmpty() ? tempBuffer.data() : localeID; 1753 } else { 1754 if (localeID==nullptr) { 1755 localeID=uloc_getDefault(); 1756 } 1757 tmpLocaleID=localeID; 1758 } 1759 1760 ulocimp_getSubtags( 1761 tmpLocaleID, 1762 nullptr, 1763 nullptr, 1764 nullptr, 1765 nullptr, 1766 &tmpLocaleID, 1767 *status); 1768 if (U_FAILURE(*status)) { 1769 return nullptr; 1770 } 1771 1772 /* keywords are located after '@' */ 1773 if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != nullptr) { 1774 CharString keywords = ulocimp_getKeywords(tmpLocaleID + 1, '@', false, *status); 1775 if (U_FAILURE(*status)) { 1776 return nullptr; 1777 } 1778 return uloc_openKeywordList(keywords.data(), keywords.length(), status); 1779 } 1780 return nullptr; 1781 } 1782 1783 1784 /* bit-flags for 'options' parameter of _canonicalize */ 1785 #define _ULOC_STRIP_KEYWORDS 0x2 1786 #define _ULOC_CANONICALIZE 0x1 1787 1788 namespace { 1789 1790 inline bool OPTION_SET(uint32_t options, uint32_t mask) { return (options & mask) != 0; } 1791 1792 constexpr char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}; 1793 constexpr int32_t I_DEFAULT_LENGTH = UPRV_LENGTHOF(i_default); 1794 1795 /** 1796 * Canonicalize the given localeID, to level 1 or to level 2, 1797 * depending on the options. To specify level 1, pass in options=0. 
1798 * To specify level 2, pass in options=_ULOC_CANONICALIZE. 1799 * 1800 * This is the code underlying uloc_getName and uloc_canonicalize. 1801 */ 1802 void 1803 _canonicalize(std::string_view localeID, 1804 ByteSink& sink, 1805 uint32_t options, 1806 UErrorCode& err) { 1807 if (U_FAILURE(err)) { 1808 return; 1809 } 1810 1811 int32_t j, fieldCount=0; 1812 CharString tempBuffer; // if localeID has a BCP47 extension, tmpLocaleID points to this 1813 CharString localeIDWithHyphens; // if localeID has a BPC47 extension and have _, tmpLocaleID points to this 1814 std::string_view origLocaleID; 1815 std::string_view tmpLocaleID; 1816 size_t keywordAssign = std::string_view::npos; 1817 size_t separatorIndicator = std::string_view::npos; 1818 1819 if (_hasBCP47Extension(localeID)) { 1820 std::string_view localeIDPtr = localeID; 1821 1822 // convert all underbars to hyphens, unless the "BCP47 extension" comes at the beginning of the string 1823 if (localeID.size() >= 2 && localeID.find('_') != std::string_view::npos && localeID[1] != '-' && localeID[1] != '_') { 1824 localeIDWithHyphens.append(localeID, err); 1825 if (U_SUCCESS(err)) { 1826 for (char* p = localeIDWithHyphens.data(); *p != '\0'; ++p) { 1827 if (*p == '_') { 1828 *p = '-'; 1829 } 1830 } 1831 localeIDPtr = localeIDWithHyphens.toStringPiece(); 1832 } 1833 } 1834 1835 tempBuffer = ulocimp_forLanguageTag(localeIDPtr.data(), static_cast<int32_t>(localeIDPtr.size()), nullptr, err); 1836 tmpLocaleID = U_SUCCESS(err) && !tempBuffer.isEmpty() ? 
static_cast<std::string_view>(tempBuffer.toStringPiece()) : localeIDPtr; 1837 } else { 1838 tmpLocaleID=localeID; 1839 } 1840 1841 origLocaleID=tmpLocaleID; 1842 1843 /* get all pieces, one after another, and separate with '_' */ 1844 CharString tag; 1845 CharString script; 1846 CharString country; 1847 CharString variant; 1848 const char* end = nullptr; 1849 ulocimp_getSubtags( 1850 tmpLocaleID, 1851 &tag, 1852 &script, 1853 &country, 1854 &variant, 1855 &end, 1856 err); 1857 if (U_FAILURE(err)) { 1858 return; 1859 } 1860 U_ASSERT(end != nullptr); 1861 if (end > tmpLocaleID.data()) { 1862 tmpLocaleID.remove_prefix(end - tmpLocaleID.data()); 1863 } 1864 1865 if (tag.length() == I_DEFAULT_LENGTH && origLocaleID.length() >= I_DEFAULT_LENGTH && 1866 uprv_strncmp(origLocaleID.data(), i_default, I_DEFAULT_LENGTH) == 0) { 1867 tag.clear(); 1868 tag.append(uloc_getDefault(), err); 1869 } else { 1870 if (!script.isEmpty()) { 1871 ++fieldCount; 1872 tag.append('_', err); 1873 tag.append(script, err); 1874 } 1875 if (!country.isEmpty()) { 1876 ++fieldCount; 1877 tag.append('_', err); 1878 tag.append(country, err); 1879 } 1880 if (!variant.isEmpty()) { 1881 ++fieldCount; 1882 if (country.isEmpty()) { 1883 tag.append('_', err); 1884 } 1885 tag.append('_', err); 1886 tag.append(variant, err); 1887 } 1888 } 1889 1890 /* Copy POSIX-style charset specifier, if any [mr.utf8] */ 1891 if (!OPTION_SET(options, _ULOC_CANONICALIZE) && !tmpLocaleID.empty() && tmpLocaleID.front() == '.') { 1892 tag.append('.', err); 1893 tmpLocaleID.remove_prefix(1); 1894 size_t length; 1895 if (size_t atPos = tmpLocaleID.find('@'); atPos != std::string_view::npos) { 1896 length = atPos; 1897 } else { 1898 length = tmpLocaleID.length(); 1899 } 1900 // The longest charset name we found in IANA charset registry 1901 // https://www.iana.org/assignments/character-sets/ is 1902 // "Extended_UNIX_Code_Packed_Format_for_Japanese" in length 45. 
1903 // we therefore restrict the length here to be 64 which is a power of 2 1904 // number that is longer than 45. 1905 constexpr size_t kMaxCharsetLength = 64; 1906 if (length > kMaxCharsetLength) { 1907 err = U_ILLEGAL_ARGUMENT_ERROR; /* malformed keyword name */ 1908 return; 1909 } 1910 if (length > 0) { 1911 tag.append(tmpLocaleID.data(), static_cast<int32_t>(length), err); 1912 tmpLocaleID.remove_prefix(length); 1913 } 1914 } 1915 1916 /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';' 1917 After this, tmpLocaleID either starts at '@' or is empty. */ 1918 if (const char* start = locale_getKeywordsStart(tmpLocaleID); start != nullptr) { 1919 if (start > tmpLocaleID.data()) { 1920 tmpLocaleID.remove_prefix(start - tmpLocaleID.data()); 1921 } 1922 keywordAssign = tmpLocaleID.find('='); 1923 separatorIndicator = tmpLocaleID.find(';'); 1924 } else { 1925 tmpLocaleID = {}; 1926 } 1927 1928 /* Copy POSIX-style variant, if any [mr@FOO] */ 1929 if (!OPTION_SET(options, _ULOC_CANONICALIZE) && 1930 !tmpLocaleID.empty() && keywordAssign == std::string_view::npos) { 1931 tag.append(tmpLocaleID, err); 1932 tmpLocaleID = {}; 1933 } 1934 1935 if (OPTION_SET(options, _ULOC_CANONICALIZE)) { 1936 /* Handle @FOO variant if @ is present and not followed by = */ 1937 if (!tmpLocaleID.empty() && keywordAssign == std::string_view::npos) { 1938 /* Add missing '_' if needed */ 1939 if (fieldCount < 2 || (fieldCount < 3 && !script.isEmpty())) { 1940 do { 1941 tag.append('_', err); 1942 ++fieldCount; 1943 } while(fieldCount<2); 1944 } 1945 1946 CharStringByteSink s(&tag); 1947 std::string_view sub = tmpLocaleID; 1948 sub.remove_prefix(1); 1949 _getVariant(sub, '@', &s, !variant.isEmpty(), err); 1950 if (U_FAILURE(err)) { return; } 1951 } 1952 1953 /* Look up the ID in the canonicalization map */ 1954 for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) { 1955 StringPiece id(CANONICALIZE_MAP[j].id); 1956 if (tag == id) { 1957 if (id.empty() && !tmpLocaleID.empty()) 
{ 1958 break; /* Don't remap "" if keywords present */ 1959 } 1960 tag.clear(); 1961 tag.append(CANONICALIZE_MAP[j].canonicalID, err); 1962 break; 1963 } 1964 } 1965 } 1966 1967 sink.Append(tag.data(), tag.length()); 1968 1969 if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) { 1970 if (!tmpLocaleID.empty() && keywordAssign != std::string_view::npos && 1971 (separatorIndicator == std::string_view::npos || separatorIndicator > keywordAssign)) { 1972 sink.Append("@", 1); 1973 ++fieldCount; 1974 tmpLocaleID.remove_prefix(1); 1975 ulocimp_getKeywords(tmpLocaleID, '@', sink, true, err); 1976 } 1977 } 1978 } 1979 1980 } // namespace 1981 1982 /* ### ID parsing API **************************************************/ 1983 1984 U_CAPI int32_t U_EXPORT2 1985 uloc_getParent(const char* localeID, 1986 char* parent, 1987 int32_t parentCapacity, 1988 UErrorCode* err) 1989 { 1990 return ByteSinkUtil::viaByteSinkToTerminatedChars( 1991 parent, parentCapacity, 1992 [&](ByteSink& sink, UErrorCode& status) { 1993 ulocimp_getParent(localeID, sink, status); 1994 }, 1995 *err); 1996 } 1997 1998 U_EXPORT CharString 1999 ulocimp_getParent(const char* localeID, 2000 UErrorCode& err) 2001 { 2002 return ByteSinkUtil::viaByteSinkToCharString( 2003 [&](ByteSink& sink, UErrorCode& status) { 2004 ulocimp_getParent(localeID, sink, status); 2005 }, 2006 err); 2007 } 2008 2009 U_EXPORT void 2010 ulocimp_getParent(const char* localeID, 2011 icu::ByteSink& sink, 2012 UErrorCode& err) 2013 { 2014 if (U_FAILURE(err)) { return; } 2015 2016 const char *lastUnderscore; 2017 int32_t i; 2018 2019 if (localeID == nullptr) 2020 localeID = uloc_getDefault(); 2021 2022 lastUnderscore=uprv_strrchr(localeID, '_'); 2023 if(lastUnderscore!=nullptr) { 2024 i = static_cast<int32_t>(lastUnderscore - localeID); 2025 } else { 2026 i=0; 2027 } 2028 2029 if (i > 0) { 2030 if (uprv_strnicmp(localeID, "und_", 4) == 0) { 2031 localeID += 3; 2032 i -= 3; 2033 } 2034 sink.Append(localeID, i); 2035 } 2036 } 2037 2038 U_CAPI 
int32_t U_EXPORT2
uloc_getLanguage(const char*    localeID,
         char* language,
         int32_t languageCapacity,
         UErrorCode* err)
{
    // A nullptr ID means "use the current default locale".
    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }

    /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
    // Only the first (language) output slot of ulocimp_getSubtags() is requested.
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        language, languageCapacity,
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getSubtags(
                    localeID,
                    &sink,
                    nullptr,
                    nullptr,
                    nullptr,
                    nullptr,
                    status);
        },
        *err);
}

/**
 * Writes the script subtag of localeID (nullptr selects the default locale)
 * into the caller's buffer; only the script output slot of
 * ulocimp_getSubtags() is requested.
 */
U_CAPI int32_t U_EXPORT2
uloc_getScript(const char*    localeID,
         char* script,
         int32_t scriptCapacity,
         UErrorCode* err)
{
    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }

    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        script, scriptCapacity,
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getSubtags(
                    localeID,
                    nullptr,
                    &sink,
                    nullptr,
                    nullptr,
                    nullptr,
                    status);
        },
        *err);
}

/**
 * Writes the country/region subtag of localeID (nullptr selects the default
 * locale) into the caller's buffer; only the third output slot of
 * ulocimp_getSubtags() is requested.
 */
U_CAPI int32_t U_EXPORT2
uloc_getCountry(const char* localeID,
            char* country,
            int32_t countryCapacity,
            UErrorCode* err)
{
    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }

    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        country, countryCapacity,
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getSubtags(
                    localeID,
                    nullptr,
                    nullptr,
                    &sink,
                    nullptr,
                    nullptr,
                    status);
        },
        *err);
}

/**
 * Writes the variant subtag of localeID (nullptr selects the default locale)
 * into the caller's buffer; only the variant output slot of
 * ulocimp_getSubtags() is requested.
 */
U_CAPI int32_t U_EXPORT2
uloc_getVariant(const char* localeID,
                char* variant,
                int32_t variantCapacity,
                UErrorCode* err)
{
    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }

    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        variant, variantCapacity,
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getSubtags(
                    localeID,
                    nullptr,
                    nullptr,
                    nullptr,
                    &sink,
                    nullptr,
                    status);
        },
        *err);
}

/**
 * C API wrapper around ulocimp_getName(): full name of localeID, with
 * nullptr selecting the default locale.
 */
U_CAPI int32_t U_EXPORT2
uloc_getName(const char* localeID,
             char* name,
             int32_t nameCapacity,
             UErrorCode* err)
{
    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity,
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getName(localeID, sink, status);
        },
        *err);
}

/** Convenience overload returning the result of ulocimp_getName() as a CharString. */
U_EXPORT CharString
ulocimp_getName(std::string_view localeID,
                UErrorCode& err)
{
    return ByteSinkUtil::viaByteSinkToCharString(
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getName(localeID, sink, status);
        },
        err);
}

/** Runs _canonicalize() with options=0 (level 1, keywords kept). */
U_EXPORT void
ulocimp_getName(std::string_view localeID,
                ByteSink& sink,
                UErrorCode& err)
{
    _canonicalize(localeID, sink, 0, err);
}

/**
 * C API wrapper around ulocimp_getBaseName(): name of localeID without
 * keywords, with nullptr selecting the default locale.
 */
U_CAPI int32_t U_EXPORT2
uloc_getBaseName(const char* localeID,
                 char* name,
                 int32_t nameCapacity,
                 UErrorCode* err)
{
    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity,
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getBaseName(localeID, sink, status);
        },
        *err);
}

/** Convenience overload returning the result of ulocimp_getBaseName() as a CharString. */
U_EXPORT CharString
ulocimp_getBaseName(std::string_view localeID,
                    UErrorCode& err)
{
    return ByteSinkUtil::viaByteSinkToCharString(
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_getBaseName(localeID, sink, status);
        },
        err);
}

/** Runs _canonicalize() with _ULOC_STRIP_KEYWORDS (level 1, keywords dropped). */
U_EXPORT void
ulocimp_getBaseName(std::string_view localeID,
                    ByteSink& sink,
                    UErrorCode& err)
{
    _canonicalize(localeID, sink, _ULOC_STRIP_KEYWORDS, err);
}

/**
 * C API wrapper around ulocimp_canonicalize(): level 2 canonical form of
 * localeID, with nullptr selecting the default locale.
 */
U_CAPI int32_t U_EXPORT2
uloc_canonicalize(const char* localeID,
                  char* name,
                  int32_t nameCapacity,
                  UErrorCode* err)
{
    if (localeID == nullptr) {
        localeID = uloc_getDefault();
    }
    return ByteSinkUtil::viaByteSinkToTerminatedChars(
        name, nameCapacity,
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_canonicalize(localeID, sink, status);
        },
        *err);
}

/** Convenience overload returning the result of ulocimp_canonicalize() as a CharString. */
U_EXPORT CharString
ulocimp_canonicalize(std::string_view localeID,
                     UErrorCode& err)
{
    return ByteSinkUtil::viaByteSinkToCharString(
        [&](ByteSink& sink, UErrorCode& status) {
            ulocimp_canonicalize(localeID, sink, status);
        },
        err);
}

/** Runs _canonicalize() with _ULOC_CANONICALIZE (level 2, keywords kept). */
U_EXPORT void
ulocimp_canonicalize(std::string_view localeID,
                     ByteSink& sink,
                     UErrorCode& err)
{
    _canonicalize(localeID, sink, _ULOC_CANONICALIZE, err);
}

/**
 * Returns the LANGUAGES_3 entry that sits at the same index as the language
 * of localeID in LANGUAGES, or "" when the language cannot be extracted or is
 * not found. A nullptr localeID selects the default locale.
 */
U_CAPI const char*  U_EXPORT2
uloc_getISO3Language(const char* localeID)
{
    UErrorCode err = U_ZERO_ERROR;

    if (localeID == nullptr)
    {
        localeID = uloc_getDefault();
    }
    CharString lang = ulocimp_getLanguage(localeID, err);
    if (U_FAILURE(err))
        return "";
    std::optional<int16_t> offset = _findIndex(LANGUAGES, lang.data());
    return offset.has_value() ? LANGUAGES_3[*offset] : "";
}

/**
 * Returns the COUNTRIES_3 entry that sits at the same index as the region of
 * localeID in COUNTRIES, or "" when the region cannot be extracted or is not
 * found. A nullptr localeID selects the default locale.
 */
U_CAPI const char*  U_EXPORT2
uloc_getISO3Country(const char* localeID)
{
    UErrorCode err = U_ZERO_ERROR;

    if (localeID == nullptr)
    {
        localeID = uloc_getDefault();
    }
    CharString cntry = ulocimp_getRegion(localeID, err);
    if (U_FAILURE(err))
        return "";
    std::optional<int16_t> offset = _findIndex(COUNTRIES, cntry.data());
    return offset.has_value() ? COUNTRIES_3[*offset] : "";
}

/**
 * Maps localeID to a Windows LCID.
 *
 * Returns 0 for a nullptr ID, an ID shorter than two characters, or when any
 * lookup step reports failure. The platform lookup is tried first; otherwise
 * the ID is passed to uprv_convertToLCID(), after reducing its keywords to
 * "collation" only when a non-empty collation value is present.
 */
U_CAPI uint32_t  U_EXPORT2
uloc_getLCID(const char* localeID)
{
    UErrorCode status = U_ZERO_ERROR;
    uint32_t   lcid = 0;

    /* Check for incomplete id. */
    if (!localeID || uprv_strlen(localeID) < 2) {
        return 0;
    }

    // First, attempt Windows platform lookup if available, but fall
    // through to catch any special cases (ICU vs Windows name differences).
    lcid = uprv_convertToLCIDPlatform(localeID, &status);
    if (U_FAILURE(status)) {
        return 0;
    }
    if (lcid > 0) {
        // Windows found an LCID, return that
        return lcid;
    }

    CharString langID = ulocimp_getLanguage(localeID, status);
    if (U_FAILURE(status)) {
        return 0;
    }

    if (uprv_strchr(localeID, '@')) {
        // uprv_convertToLCID does not support keywords other than collation.
        // Remove all keywords except collation.
        CharString collVal = ulocimp_getKeywordValue(localeID, "collation", status);
        if (U_SUCCESS(status) && !collVal.isEmpty()) {
            CharString tmpLocaleID = ulocimp_getBaseName(localeID, status);
            ulocimp_setKeywordValue("collation", collVal.toStringPiece(), tmpLocaleID, status);
            if (U_SUCCESS(status)) {
                return uprv_convertToLCID(langID.data(), tmpLocaleID.data(), &status);
            }
        }

        // fall through - all keywords are simply ignored
        // Clear any failure from the keyword handling above so that the
        // final lookup starts from a clean status.
        status = U_ZERO_ERROR;
    }

    return uprv_convertToLCID(langID.data(), localeID, &status);
}

/** Thin wrapper: forwards all arguments to uprv_convertToPosix() and returns its result. */
U_CAPI int32_t U_EXPORT2
uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
                UErrorCode *status)
{
    return uprv_convertToPosix(hostid, locale, localeCapacity, status);
}

/* ### Default locale **************************************************/

/** Returns the default locale ID as reported by locale_get_default(). */
U_CAPI const char*  U_EXPORT2
uloc_getDefault()
{
    return locale_get_default();
}

/**
 * Sets the default locale by forwarding newDefaultLocale to
 * locale_set_default(); does nothing if *err already indicates failure.
 * NOTE(review): err is dereferenced unconditionally, so it must not be nullptr.
 */
U_CAPI void  U_EXPORT2
uloc_setDefault(const char*   newDefaultLocale,
             UErrorCode* err)
{
    if (U_FAILURE(*err))
        return;
    /* the error code isn't currently used for anything by this function*/

    /* propagate change to C++ */
locale_set_default(newDefaultLocale);
}

/**
 * Returns the LANGUAGES table of ISO 639 language codes. This is a pointer
 * to an array of pointers to arrays of char.  All of these pointers are owned
 * by ICU-- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 */
U_CAPI const char* const*  U_EXPORT2
uloc_getISOLanguages()
{
    return LANGUAGES;
}

/**
 * Returns the COUNTRIES table of 2-letter country codes defined in ISO 3166.
 * This is a pointer to an array of pointers to arrays of char.  All of these
 * pointers are owned by ICU-- do not delete them, and do not write through
 * them.  The array is terminated with a null pointer.
 */
U_CAPI const char* const*  U_EXPORT2
uloc_getISOCountries()
{
    return COUNTRIES;
}

/**
 * C API wrapper around ulocimp_toBcpKeyWithFallback().
 * Returns nullptr for a nullptr or empty keyword, or when no key is found.
 */
U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleKey(const char* keyword)
{
    if (keyword == nullptr || *keyword == '\0') { return nullptr; }
    std::optional<std::string_view> result = ulocimp_toBcpKeyWithFallback(keyword);
    return result.has_value() ? result->data() : nullptr;  // Known to be NUL terminated.
}

/**
 * Maps keyword with ulocimp_toBcpKey(); when that finds nothing and
 * ultag_isUnicodeLocaleKey() accepts the keyword, the keyword itself is
 * returned unchanged. Otherwise the (possibly empty) lookup result is returned.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toBcpKeyWithFallback(std::string_view keyword)
{
    std::optional<std::string_view> bcpKey = ulocimp_toBcpKey(keyword);
    if (!bcpKey.has_value() &&
        ultag_isUnicodeLocaleKey(keyword.data(), static_cast<int32_t>(keyword.size()))) {
        // unknown keyword, but syntax is fine..
        return keyword;
    }
    return bcpKey;
}

/**
 * C API wrapper around ulocimp_toBcpTypeWithFallback().
 * Returns nullptr when keyword or value is nullptr or empty, or when no type
 * is found.
 */
U_CAPI const char* U_EXPORT2
uloc_toUnicodeLocaleType(const char* keyword, const char* value)
{
    if (keyword == nullptr || *keyword == '\0' ||
        value == nullptr || *value == '\0') { return nullptr; }
    std::optional<std::string_view> result = ulocimp_toBcpTypeWithFallback(keyword, value);
    return result.has_value() ? result->data() : nullptr;  // Known to be NUL terminated.
}

/**
 * Maps value with ulocimp_toBcpType(); when that finds nothing and
 * ultag_isUnicodeLocaleType() accepts the value, the value itself is
 * returned unchanged. Otherwise the (possibly empty) lookup result is returned.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toBcpTypeWithFallback(std::string_view keyword, std::string_view value)
{
    std::optional<std::string_view> bcpType = ulocimp_toBcpType(keyword, value);
    if (!bcpType.has_value() &&
        ultag_isUnicodeLocaleType(value.data(), static_cast<int32_t>(value.size()))) {
        // unknown type, but syntax is fine..
        return value;
    }
    return bcpType;
}

namespace {

/**
 * Returns true if every character of key satisfies UPRV_ISALPHANUM.
 * An empty key yields true, because std::all_of is true on an empty range.
 */
bool
isWellFormedLegacyKey(std::string_view key)
{
    return std::all_of(key.begin(), key.end(), UPRV_ISALPHANUM);
}

/**
 * Returns true if legacyType is one or more non-empty runs of
 * UPRV_ISALPHANUM characters separated by single '_', '/' or '-' characters.
 * Empty input, a leading, trailing or doubled separator, and any other
 * character all yield false.
 */
bool
isWellFormedLegacyType(std::string_view legacyType)
{
    int32_t alphaNumLen = 0;  // length of the current alphanumeric run
    for (char c : legacyType) {
        if (c == '_' || c == '/' || c == '-') {
            // A separator must follow at least one alphanumeric character.
            if (alphaNumLen == 0) {
                return false;
            }
            alphaNumLen = 0;
        } else if (UPRV_ISALPHANUM(c)) {
            alphaNumLen++;
        } else {
            return false;
        }
    }
    // Reject empty input and a trailing separator.
    return alphaNumLen != 0;
}

}  // namespace

/**
 * C API wrapper around ulocimp_toLegacyKeyWithFallback().
 * Returns nullptr for a nullptr or empty keyword, or when no key is found.
 */
U_CAPI const char* U_EXPORT2
uloc_toLegacyKey(const char* keyword)
{
    if (keyword == nullptr || *keyword == '\0') { return nullptr; }
    std::optional<std::string_view> result = ulocimp_toLegacyKeyWithFallback(keyword);
    return result.has_value() ? result->data() : nullptr;  // Known to be NUL terminated.
}

/**
 * Maps keyword with ulocimp_toLegacyKey(); when that finds nothing and
 * isWellFormedLegacyKey() accepts the keyword, the keyword itself is
 * returned unchanged. Otherwise the (possibly empty) lookup result is returned.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toLegacyKeyWithFallback(std::string_view keyword)
{
    std::optional<std::string_view> legacyKey = ulocimp_toLegacyKey(keyword);
    if (!legacyKey.has_value() && isWellFormedLegacyKey(keyword)) {
        // Checks if the specified locale key is well-formed with the legacy locale syntax.
        //
        // Note:
        //  LDML/CLDR provides some definition of keyword syntax in
        //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
        //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
        //  Keys can only consist of [0-9a-zA-Z].
        return keyword;
    }
    return legacyKey;
}

/**
 * C API wrapper around ulocimp_toLegacyTypeWithFallback().
 * Returns nullptr when keyword or value is nullptr or empty, or when no type
 * is found.
 */
U_CAPI const char* U_EXPORT2
uloc_toLegacyType(const char* keyword, const char* value)
{
    if (keyword == nullptr || *keyword == '\0' ||
        value == nullptr || *value == '\0') { return nullptr; }
    std::optional<std::string_view> result = ulocimp_toLegacyTypeWithFallback(keyword, value);
    return result.has_value() ? result->data() : nullptr;  // Known to be NUL terminated.
}

/**
 * Maps value with ulocimp_toLegacyType(); when that finds nothing and
 * isWellFormedLegacyType() accepts the value, the value itself is
 * returned unchanged. Otherwise the (possibly empty) lookup result is returned.
 */
U_EXPORT std::optional<std::string_view>
ulocimp_toLegacyTypeWithFallback(std::string_view keyword, std::string_view value)
{
    std::optional<std::string_view> legacyType = ulocimp_toLegacyType(keyword, value);
    if (!legacyType.has_value() && isWellFormedLegacyType(value)) {
        // Checks if the specified locale type is well-formed with the legacy locale syntax.
        //
        // Note:
        //  LDML/CLDR provides some definition of keyword syntax in
        //  * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier and
        //  * http://www.unicode.org/reports/tr35/#Old_Locale_Extension_Syntax
        //  Values (types) can only consist of [0-9a-zA-Z], plus for legacy values
        //  we allow [/_-] in the middle (e.g. "Asia/Tel_Aviv").
        //  NOTE(review): '+' is not accepted as a separator by
        //  isWellFormedLegacyType(), so a value such as "Etc/GMT+1" passes this
        //  fallback only if UPRV_ISALPHANUM accepts '+' (presumably it does
        //  not) -- confirm whether '+' should be allowed here.
        return value;
    }
    return legacyType;
}

/*eof*/