make_intl_data.py (134274B)
#!/usr/bin/env python
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""Usage:
    make_intl_data.py langtags [cldr_common.zip]
    make_intl_data.py tzdata
    make_intl_data.py currency
    make_intl_data.py units
    make_intl_data.py numbering


    Target "langtags":
    This script extracts information about 1) mappings between deprecated and
    current Unicode BCP 47 locale identifiers, and 2) deprecated and current
    BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping
    code in intl/components/LocaleGenerated.cpp. The code is used in
    intl/components/Locale.cpp.


    Target "tzdata":
    This script computes which time zone information is not up-to-date in ICU
    and provides the necessary mappings to workaround this problem.
    https://ssl.icu-project.org/trac/ticket/12044


    Target "currency":
    Generates the mapping from currency codes to decimal digits used for them.


    Target "units":
    Generate source and test files using the list of so-called "sanctioned unit
    identifiers" and verifies that the ICU data filter includes these units.


    Target "numbering":
    Generate source and test files using the list of numbering systems with
    simple digit mappings and verifies that it's in sync with ICU/CLDR.
"""

import io
import json
import os
import re
import tarfile
import tempfile
from contextlib import closing
from functools import partial, total_ordering
from itertools import chain, filterfalse, groupby, tee, zip_longest
from operator import attrgetter, itemgetter
from urllib.parse import urlsplit
from urllib.request import Request as UrlRequest
from urllib.request import urlopen
from zipfile import ZipFile

import yaml


# From https://docs.python.org/3/library/itertools.html
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


def writeMappingHeader(println, description, source, url):
    """Writes a `//` comment header naming the data |source| and |url|.

    |description| may be a single string or a list of strings, each emitted
    as its own comment line.
    """
    if type(description) is not list:
        description = [description]
    for desc in description:
        println(f"// {desc}")
    println(f"// Derived from {source}.")
    println(f"// {url}")


def writeMappingsVar(println, mapping, name, description, source, url):
    """Writes a variable definition with a mapping table.

    Writes the contents of dictionary |mapping| through the |println|
    function with the given variable name and a comment with description,
    source, and URL.
    """
    println("")
    writeMappingHeader(println, description, source, url)
    println(f"var {name} = {{")
    for key, value in sorted(mapping.items(), key=itemgetter(0)):
        println(f'    "{key}": "{value}",')
    println("};")


def writeMappingsBinarySearch(
    println,
    fn_name,
    type_name,
    name,
    validate_fn,
    validate_case_fn,
    mappings,
    tag_maxlength,
    description,
    source,
    url,
):
    """Emit code to perform a binary search on language tag subtags.

    Uses the contents of |mappings|, which can either be a dictionary or set,
    to emit a mapping function to find subtag replacements.
    """
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        f"""
bool mozilla::intl::Locale::{fn_name}({type_name} {name}) {{
  MOZ_ASSERT({validate_fn}({name}.Span()));
  MOZ_ASSERT({validate_case_fn}({name}.Span()));
""".strip()
    )
    writeMappingsBinarySearchBody(println, name, name, mappings, tag_maxlength)

    println(
        """
}""".lstrip("\n")
    )


def writeMappingsBinarySearchBody(
    println, source_name, target_name, mappings, tag_maxlength
):
    """Emits the body of a subtag-replacement lookup function.

    Subtags are bucketed by length; short buckets use chained comparisons,
    larger ones a sorted array searched via SearchReplacement/HasReplacement.
    """

    # Emit a C++ array of string literals for one length bucket.
    def write_array(subtags, name, length, fixed):
        if fixed:
            println(f"    static const char {name}[{len(subtags)}][{length + 1}] = {{")
        else:
            println(f"    static const char* {name}[{len(subtags)}] = {{")

        # Group in pairs of ten to not exceed the 80 column line limit.
        for entries in grouper(subtags, 10):
            entries = (
                f'"{tag}"'.rjust(length + 2) for tag in entries if tag is not None
            )
            println("      {},".format(", ".join(entries)))

        println("    };")

    trailing_return = True

    # Sort the subtags by length. That enables using an optimized comparator
    # for the binary search, which only performs a single |memcmp| for multiple
    # of two subtag lengths.
    mappings_keys = mappings.keys() if type(mappings) is dict else mappings
    for length, subtags in groupby(sorted(mappings_keys, key=len), len):
        # Omit the length check if the current length is the maximum length.
        if length != tag_maxlength:
            println(
                f"""
  if ({source_name}.Length() == {length}) {{
""".rstrip("\n")
            )
        else:
            trailing_return = False
            println(
                """
  {
""".rstrip("\n")
            )

        # The subtags need to be sorted for binary search to work.
        subtags = sorted(subtags)

        def equals(subtag):
            return f"""{source_name}.EqualTo("{subtag}")"""

        # Don't emit a binary search for short lists.
        if len(subtags) == 1:
            if type(mappings) is dict:
                println(
                    f"""
    if ({equals(subtags[0])}) {{
      {target_name}.Set(mozilla::MakeStringSpan("{mappings[subtags[0]]}"));
      return true;
    }}
    return false;
""".strip("\n")
                )
            else:
                println(
                    f"""
    return {equals(subtags[0])};
""".strip("\n")
                )
        elif len(subtags) <= 4:
            if type(mappings) is dict:
                for subtag in subtags:
                    println(
                        f"""
    if ({equals(subtag)}) {{
      {target_name}.Set("{mappings[subtag]}");
      return true;
    }}
""".strip("\n")
                    )

                println(
                    """
    return false;
""".strip("\n")
                )
            else:
                cond = (equals(subtag) for subtag in subtags)
                cond = (" ||\n" + " " * (4 + len("return "))).join(cond)
                println(
                    f"""
    return {cond};
""".strip("\n")
                )
        else:
            write_array(subtags, source_name + "s", length, True)

            if type(mappings) is dict:
                write_array([mappings[k] for k in subtags], "aliases", length, False)

                println(
                    f"""
    if (const char* replacement = SearchReplacement({source_name}s, aliases, {source_name})) {{
      {target_name}.Set(mozilla::MakeStringSpan(replacement));
      return true;
    }}
    return false;
""".rstrip()
                )
            else:
                println(
                    f"""
    return HasReplacement({source_name}s, {source_name});
""".rstrip()
                )

        println(
            """
  }
""".strip("\n")
        )

    if trailing_return:
        println(
            """
  return false;"""
        )


def writeComplexLanguageTagMappings(
    println, complex_language_mappings, description, source, url
):
    """Writes PerformComplexLanguageMappings(), which maps deprecated language
    subtags whose replacement also touches the script and/or region subtag."""
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """
void mozilla::intl::Locale::PerformComplexLanguageMappings() {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
""".lstrip()
    )

    # Merge duplicate language entries.
    language_aliases = {}
    for deprecated_language, (language, script, region) in sorted(
        complex_language_mappings.items(), key=itemgetter(0)
    ):
        key = (language, script, region)
        if key not in language_aliases:
            language_aliases[key] = []
        else:
            language_aliases[key].append(deprecated_language)

    first_language = True
    for deprecated_language, (language, script, region) in sorted(
        complex_language_mappings.items(), key=itemgetter(0)
    ):
        key = (language, script, region)
        # Skip aliases already folded into an earlier entry's condition.
        if deprecated_language in language_aliases[key]:
            continue

        if_kind = "if" if first_language else "else if"
        first_language = False

        cond = (
            f'Language().EqualTo("{lang}")'
            for lang in [deprecated_language] + language_aliases[key]
        )
        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)

        println(
            f"""
  {if_kind} ({cond}) {{""".strip("\n")
        )

        println(
            f"""
    SetLanguage("{language}");""".strip("\n")
        )

        # Script/region replacements only apply when not already present.
        if script is not None:
            println(
                f"""
    if (Script().Missing()) {{
      SetScript("{script}");
    }}""".strip("\n")
            )
        if region is not None:
            println(
                f"""
    if (Region().Missing()) {{
      SetRegion("{region}");
    }}""".strip("\n")
            )
        println(
            """
  }""".strip("\n")
        )

    println(
        """
}
""".strip("\n")
    )


def writeComplexRegionTagMappings(
    println, complex_region_mappings, description, source, url
):
    """Writes PerformComplexRegionMappings(), which maps deprecated region
    subtags that have more than one possible replacement; the language (and
    script) subtag selects among them, with a per-entry default."""
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """
void mozilla::intl::Locale::PerformComplexRegionMappings() {
  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
  MOZ_ASSERT(IsStructurallyValidRegionTag(Region().Span()));
  MOZ_ASSERT(IsCanonicallyCasedRegionTag(Region().Span()));
""".lstrip()
    )

    # |non_default_replacements| is a list and hence not hashable. Convert it
    # to a string to get a proper hashable value.
    def hash_key(default, non_default_replacements):
        return (default, str(sorted(str(v) for v in non_default_replacements)))

    # Merge duplicate region entries.
    region_aliases = {}
    for deprecated_region, (default, non_default_replacements) in sorted(
        complex_region_mappings.items(), key=itemgetter(0)
    ):
        key = hash_key(default, non_default_replacements)
        if key not in region_aliases:
            region_aliases[key] = []
        else:
            region_aliases[key].append(deprecated_region)

    first_region = True
    for deprecated_region, (default, non_default_replacements) in sorted(
        complex_region_mappings.items(), key=itemgetter(0)
    ):
        key = hash_key(default, non_default_replacements)
        # Skip aliases already folded into an earlier entry's condition.
        if deprecated_region in region_aliases[key]:
            continue

        if_kind = "if" if first_region else "else if"
        first_region = False

        cond = (
            f'Region().EqualTo("{region}")'
            for region in [deprecated_region] + region_aliases[key]
        )
        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)

        println(
            f"""
  {if_kind} ({cond}) {{""".strip("\n")
        )

        replacement_regions = sorted(
            {region for (_, _, region) in non_default_replacements}
        )

        first_case = True
        for replacement_region in replacement_regions:
            replacement_language_script = sorted(
                (language, script)
                for (language, script, region) in (non_default_replacements)
                if region == replacement_region
            )

            if_kind = "if" if first_case else "else if"
            first_case = False

            def compare_tags(language, script):
                if script is None:
                    return f'Language().EqualTo("{language}")'
                return f'(Language().EqualTo("{language}") && Script().EqualTo("{script}"))'

            cond = (
                compare_tags(language, script)
                for (language, script) in replacement_language_script
            )
            cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond)

            println(
                f"""
    {if_kind} ({cond}) {{
      SetRegion("{replacement_region}");
    }}""".rstrip().strip("\n")
            )

        println(
            f"""
    else {{
      SetRegion("{default}");
    }}
  }}""".rstrip().strip("\n")
        )

    println(
        """
}
""".strip("\n")
    )


def writeVariantTagMappings(println, variant_mappings, description, source, url):
    """Writes a function definition that maps variant subtags."""
    println(
        """
static auto ToSpan(const mozilla::Span<const char>& aSpan) {
  return aSpan;
}

template <size_t N>
static auto ToSpan(const mozilla::intl::LanguageTagSubtag<N>& aSubtag) {
  return aSubtag.Span();
}

template <typename T, typename U = T>
static bool IsLessThan(const T& a, const U& b) {
  return ToSpan(a) < ToSpan(b);
}
"""
    )
    writeMappingHeader(println, description, source, url)
    println(
        """
bool mozilla::intl::Locale::PerformVariantMappings() {
  // The variant subtags need to be sorted for binary search.
  MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
                            IsLessThan<decltype(mVariants)::ElementType>));

  auto removeVariantAt = [&](size_t index) {
    mVariants.erase(mVariants.begin() + index);
  };

  auto insertVariantSortedIfNotPresent = [&](mozilla::Span<const char> variant) {
    auto* p = std::lower_bound(
        mVariants.begin(), mVariants.end(), variant,
        IsLessThan<decltype(mVariants)::ElementType, decltype(variant)>);

    // Don't insert the replacement when already present.
    if (p != mVariants.end() && p->Span() == variant) {
      return true;
    }

    // Insert the preferred variant in sort order.
    auto preferred = mozilla::intl::VariantSubtag{variant};
    return !!mVariants.insert(p, preferred);
  };

  for (size_t i = 0; i < mVariants.length();) {
    const auto& variant = mVariants[i];
    MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant.Span()));
""".lstrip()
    )

    # NOTE(review): |partition| is defined elsewhere in this file (outside this
    # chunk); it splits the items into (no-replacement, with-replacement).
    (no_alias, with_alias) = partition(
        variant_mappings.items(), lambda item: item[1] is None
    )

    no_replacements = " ||\n        ".join(
        f"""variant.Span() == mozilla::MakeStringSpan("{deprecated_variant}")"""
        for (deprecated_variant, _) in sorted(no_alias, key=itemgetter(0))
    )

    println(
        f"""
    if ({no_replacements}) {{
      removeVariantAt(i);
    }}
""".strip("\n")
    )

    for deprecated_variant, (type, replacement) in sorted(
        with_alias, key=itemgetter(0)
    ):
        println(
            f"""
    else if (variant.Span() == mozilla::MakeStringSpan("{deprecated_variant}")) {{
      removeVariantAt(i);
""".strip("\n")
        )

        # The replacement can target the language, region or variant subtag.
        if type == "language":
            println(
                f"""
      SetLanguage("{replacement}");
""".strip("\n")
            )
        elif type == "region":
            println(
                f"""
      SetRegion("{replacement}");
""".strip("\n")
            )
        else:
            assert type == "variant"
            println(
                f"""
      if (!insertVariantSortedIfNotPresent(mozilla::MakeStringSpan("{replacement}"))) {{
        return false;
      }}
""".strip("\n")
            )

        println(
            """
    }
""".strip("\n")
        )

    println(
        """
    else {
      i++;
    }
  }
  return true;
}
""".strip("\n")
    )


def writeLegacyMappingsFunction(println, legacy_mappings, description, source, url):
    """Writes a function definition that maps legacy language tags."""
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """\
bool mozilla::intl::Locale::UpdateLegacyMappings() {
  // We're mapping legacy tags to non-legacy form here.
  // Other tags remain unchanged.
  //
  // Legacy tags are either sign language tags ("sgn") or have one or multiple
  // variant subtags. Therefore we can quickly exclude most tags by checking
  // these two subtags.

  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));

  if (!Language().EqualTo("sgn") && mVariants.length() == 0) {
    return true;
  }

#ifdef DEBUG
  for (const auto& variant : Variants()) {
    MOZ_ASSERT(IsStructurallyValidVariantTag(variant));
    MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant));
  }
#endif

  // The variant subtags need to be sorted for binary search.
  MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
                            IsLessThan<decltype(mVariants)::ElementType>));

  auto findVariant = [this](mozilla::Span<const char> variant) {
    auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant,
                               IsLessThan<decltype(mVariants)::ElementType,
                                          decltype(variant)>);

    if (p != mVariants.end() && p->Span() == variant) {
      return p;
    }
    return static_cast<decltype(p)>(nullptr);
  };

  auto insertVariantSortedIfNotPresent = [&](mozilla::Span<const char> variant) {
    auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant,
                               IsLessThan<decltype(mVariants)::ElementType,
                                          decltype(variant)>);

    // Don't insert the replacement when already present.
    if (p != mVariants.end() && p->Span() == variant) {
      return true;
    }

    // Insert the preferred variant in sort order.
    auto preferred = mozilla::intl::VariantSubtag{variant};
    return !!mVariants.insert(p, preferred);
  };

  auto removeVariant = [&](auto* p) {
    size_t index = std::distance(mVariants.begin(), p);
    mVariants.erase(mVariants.begin() + index);
  };

  auto removeVariants = [&](auto* p, auto* q) {
    size_t pIndex = std::distance(mVariants.begin(), p);
    size_t qIndex = std::distance(mVariants.begin(), q);
    MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted");

    mVariants.erase(mVariants.begin() + qIndex);
    mVariants.erase(mVariants.begin() + pIndex);
  };"""
    )

    # Helper class for pattern matching.
    class AnyClass:
        def __eq__(self, obj):
            return obj is not None

    Any = AnyClass()

    # Group the mappings by language.
    legacy_mappings_by_language = {}
    for type, replacement in legacy_mappings.items():
        (language, _, _, _) = type
        legacy_mappings_by_language.setdefault(language, {})[type] = replacement

    # Handle the empty language case first.
    if None in legacy_mappings_by_language:
        # Get the mappings and remove them from the dict.
        mappings = legacy_mappings_by_language.pop(None)

        # This case only applies for the "hepburn-heploc" -> "alalc97"
        # mapping, so just inline it here.
        from_tag = (None, None, None, "hepburn-heploc")
        to_tag = (None, None, None, "alalc97")

        assert len(mappings) == 1
        assert mappings[from_tag] == to_tag

        println(
            """
  if (mVariants.length() >= 2) {
    if (auto* hepburn = findVariant(mozilla::MakeStringSpan("hepburn"))) {
      if (auto* heploc = findVariant(mozilla::MakeStringSpan("heploc"))) {
        removeVariants(hepburn, heploc);

        if (!insertVariantSortedIfNotPresent(mozilla::MakeStringSpan("alalc97"))) {
          return false;
        }
      }
    }
  }
"""
        )

    # Handle sign languages next.
    if "sgn" in legacy_mappings_by_language:
        mappings = legacy_mappings_by_language.pop("sgn")

        # Legacy sign language mappings have the form "sgn-XX" where "XX" is
        # some region code.
        assert all(type == ("sgn", None, Any, None) for type in mappings.keys())

        # Legacy sign languages are mapped to a single language subtag.
        assert all(
            replacement == (Any, None, None, None) for replacement in mappings.values()
        )

        println(
            """
  if (Language().EqualTo("sgn")) {
    if (Region().Present() && SignLanguageMapping(mLanguage, Region())) {
      mRegion.Set(mozilla::MakeStringSpan(""));
    }
  }
""".rstrip().lstrip("\n")
        )

    # Finally handle all remaining cases.

    # The remaining mappings have neither script nor region subtags in the source locale.
    assert all(
        type == (Any, None, None, Any)
        for mappings in legacy_mappings_by_language.values()
        for type in mappings.keys()
    )

    # And they have neither script nor region nor variant subtags in the target locale.
    assert all(
        replacement == (Any, None, None, None)
        for mappings in legacy_mappings_by_language.values()
        for replacement in mappings.values()
    )

    # Compact the mappings table by removing empty fields.
    legacy_mappings_by_language = {
        lang: {
            variants: r_language
            for ((_, _, _, variants), (r_language, _, _, _)) in mappings.items()
        }
        for (lang, mappings) in legacy_mappings_by_language.items()
    }

    # Try to combine the remaining cases.
    legacy_mappings_compact = {}

    # Python can't hash dicts or lists, so use the string representation as the hash key.
    def hash_key(mappings):
        return str(sorted(mappings.items(), key=itemgetter(0)))

    for lang, mappings in sorted(
        legacy_mappings_by_language.items(), key=itemgetter(0)
    ):
        key = hash_key(mappings)
        legacy_mappings_compact.setdefault(key, []).append(lang)

    for langs in legacy_mappings_compact.values():
        language_equal_to = (
            f"""Language().EqualTo("{lang}")""" for lang in sorted(langs)
        )
        cond = f""" ||\n{" " * len("  else if (")}""".join(language_equal_to)

        println(
            f"""
  else if ({cond}) {{
""".rstrip().lstrip("\n")
        )

        mappings = legacy_mappings_by_language[langs[0]]

        # Count the variant subtags to determine the sort order.
        def variant_size(m):
            (k, _) = m
            return len(k.split("-"))

        # Alias rules are applied by largest union size first.
        for size, mappings_by_size in groupby(
            sorted(mappings.items(), key=variant_size, reverse=True), key=variant_size
        ):
            # Convert grouper object to dict.
            mappings_by_size = dict(mappings_by_size)

            is_first = True
            chain_if = size == 1

            # Alias rules are applied in alphabetical order
            for variants, r_language in sorted(
                mappings_by_size.items(), key=itemgetter(0)
            ):
                sorted_variants = sorted(variants.split("-"))
                len_variants = len(sorted_variants)

                maybe_else = "else " if chain_if and not is_first else ""
                is_first = False

                # Emit one nested |if| per required variant subtag.
                for i, variant in enumerate(sorted_variants):
                    println(
                        f"""
    {"  " * i}{maybe_else}if (auto* {variant} = findVariant(mozilla::MakeStringSpan("{variant}"))) {{
""".rstrip().lstrip("\n")
                    )

                indent = "  " * len_variants

                println(
                    f"""
    {indent}removeVariant{"s" if len_variants > 1 else ""}({", ".join(sorted_variants)});
    {indent}SetLanguage("{r_language}");
    {indent}{"return true;" if not chain_if else ""}
""".rstrip().lstrip("\n")
                )

                # Close the nested |if|s in reverse order.
                for i in range(len_variants, 0, -1):
                    println(
                        f"""
    {"  " * (i - 1)}}}
""".rstrip().lstrip("\n")
                    )

        println(
            """
  }
""".rstrip().lstrip("\n")
        )

    println(
        """
  return true;
}"""
    )


def writeSignLanguageMappingsFunction(
    println, legacy_mappings, description, source, url
):
    """Writes a function definition that maps legacy sign language tags."""
    println("")
    writeMappingHeader(println, description, source, url)
    println(
        """\
bool mozilla::intl::Locale::SignLanguageMapping(LanguageSubtag& language,
                                                const RegionSubtag& region) {
  MOZ_ASSERT(language.EqualTo("sgn"));
  MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span()));
  MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span()));
""".rstrip()
    )

    # Restrict to "sgn-XX" rules; map the region to the replacement language.
    region_mappings = {
        rg: lg
        for ((lang, _, rg, _), (lg, _, _, _)) in legacy_mappings.items()
        if lang == "sgn"
    }

    source_name = "region"
    target_name = "language"
    tag_maxlength = 3

    writeMappingsBinarySearchBody(
        println, source_name, target_name, region_mappings, tag_maxlength
    )

    println(
        """
}""".lstrip()
    )
writeMappingsBinarySearchBody( 826 println, source_name, target_name, region_mappings, tag_maxlength 827 ) 828 829 println( 830 """ 831 }""".lstrip() 832 ) 833 834 835 def readSupplementalData(core_file): 836 """Reads CLDR Supplemental Data and extracts information for Intl.js. 837 838 Information extracted: 839 - legacyMappings: mappings from legacy tags to preferred complete language tags 840 - languageMappings: mappings from language subtags to preferred subtags 841 - complexLanguageMappings: mappings from language subtags with complex rules 842 - regionMappings: mappings from region subtags to preferred subtags 843 - complexRegionMappings: mappings from region subtags with complex rules 844 - variantMappings: mappings from variant subtags to preferred subtags 845 - likelySubtags: likely subtags used for generating test data only 846 Returns these mappings as dictionaries. 847 """ 848 import xml.etree.ElementTree as ET 849 850 # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. 851 re_unicode_language_id = re.compile( 852 r""" 853 ^ 854 # unicode_language_id = unicode_language_subtag 855 # unicode_language_subtag = alpha{2,3} | alpha{5,8} 856 (?P<language>[a-z]{2,3}|[a-z]{5,8}) 857 858 # (sep unicode_script_subtag)? 859 # unicode_script_subtag = alpha{4} 860 (?:-(?P<script>[a-z]{4}))? 861 862 # (sep unicode_region_subtag)? 863 # unicode_region_subtag = (alpha{2} | digit{3}) 864 (?:-(?P<region>([a-z]{2}|[0-9]{3})))? 865 866 # (sep unicode_variant_subtag)* 867 # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) 868 (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)? 869 $ 870 """, 871 re.IGNORECASE | re.VERBOSE, 872 ) 873 874 # CLDR uses "_" as the separator for some elements. Replace it with "-". 875 def bcp47_id(cldr_id): 876 return cldr_id.replace("_", "-") 877 878 # Return the tuple (language, script, region, variants) and assert all 879 # subtags are in canonical case. 
880 def bcp47_canonical(language, script, region, variants): 881 # Canonical case for language subtags is lower case. 882 assert language is None or language.lower() == language 883 884 # Canonical case for script subtags is title case. 885 assert script is None or script.title() == script 886 887 # Canonical case for region subtags is upper case. 888 assert region is None or region.upper() == region 889 890 # Canonical case for variant subtags is lower case. 891 assert variants is None or variants.lower() == variants 892 893 return (language, script, region, variants[1:] if variants else None) 894 895 # Language ids are interpreted as multi-maps in 896 # <https://www.unicode.org/reports/tr35/#LocaleId_Canonicalization>. 897 # 898 # See UTS35, §Annex C, Definitions - 1. Multimap interpretation. 899 def language_id_to_multimap(language_id): 900 match = re_unicode_language_id.match(language_id) 901 assert match is not None, ( 902 f"{language_id} invalid Unicode BCP 47 locale identifier" 903 ) 904 905 canonical_language_id = bcp47_canonical( 906 *match.group("language", "script", "region", "variants") 907 ) 908 (language, _, _, _) = canonical_language_id 909 910 # Normalize "und" language to None, but keep the rest as is. 911 return (language if language != "und" else None,) + canonical_language_id[1:] 912 913 rules = {} 914 territory_exception_rules = {} 915 916 tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml")) 917 918 # Load the rules from supplementalMetadata.xml. 919 # 920 # See UTS35, §Annex C, Definitions - 2. Alias elements. 921 # See UTS35, §Annex C, Preprocessing. 922 for alias_name in [ 923 "languageAlias", 924 "scriptAlias", 925 "territoryAlias", 926 "variantAlias", 927 ]: 928 for alias in tree.iterfind(".//" + alias_name): 929 # Replace '_' by '-'. 930 type = bcp47_id(alias.get("type")) 931 replacement = bcp47_id(alias.get("replacement")) 932 933 # Prefix with "und-". 
934 if alias_name != "languageAlias": 935 type = "und-" + type 936 937 # Discard all rules where the type is an invalid languageId. 938 if re_unicode_language_id.match(type) is None: 939 continue 940 941 type = language_id_to_multimap(type) 942 943 # Multiple, whitespace-separated territory replacements may be present. 944 if alias_name == "territoryAlias" and " " in replacement: 945 replacements = replacement.split(" ") 946 replacement_list = [ 947 language_id_to_multimap("und-" + r) for r in replacements 948 ] 949 950 assert type not in territory_exception_rules, ( 951 f"Duplicate alias rule: {type}" 952 ) 953 954 territory_exception_rules[type] = replacement_list 955 956 # The first element is the default territory replacement. 957 replacement = replacements[0] 958 959 # Prefix with "und-". 960 if alias_name != "languageAlias": 961 replacement = "und-" + replacement 962 963 replacement = language_id_to_multimap(replacement) 964 965 assert type not in rules, f"Duplicate alias rule: {type}" 966 967 rules[type] = replacement 968 969 # Helper class for pattern matching. 970 class AnyClass: 971 def __eq__(self, obj): 972 return obj is not None 973 974 Any = AnyClass() 975 976 modified_rules = True 977 loop_count = 0 978 979 while modified_rules: 980 modified_rules = False 981 loop_count += 1 982 983 # UTS 35 defines that canonicalization is applied until a fixed point has 984 # been reached. This iterative application of the canonicalization algorithm 985 # is only needed for a relatively small set of rules, so we can precompute 986 # the transitive closure of all rules here and then perform a single pass 987 # when canonicalizing language tags at runtime. 988 transitive_rules = {} 989 990 # Compute the transitive closure. 991 # Any case which currently doesn't occur in the CLDR sources isn't supported 992 # and will lead to throwing an error. 
993 for type, replacement in rules.items(): 994 (language, script, region, variants) = type 995 (r_language, r_script, r_region, r_variants) = replacement 996 997 for i_type, i_replacement in rules.items(): 998 (i_language, i_script, i_region, i_variants) = i_type 999 (i_r_language, i_r_script, i_r_region, i_r_variants) = i_replacement 1000 1001 if i_language is not None and i_language == r_language: 1002 # This case currently only occurs when neither script nor region 1003 # subtags are present. A single variant subtags may be present 1004 # in |type|. And |i_type| definitely has a single variant subtag. 1005 # Should this ever change, update this code accordingly. 1006 assert type in { 1007 (Any, None, None, None), 1008 (Any, None, None, Any), 1009 } 1010 assert replacement == (Any, None, None, None) 1011 assert i_type == (Any, None, None, Any) 1012 assert i_replacement == (Any, None, None, None) 1013 1014 # This case happens for the rules 1015 # "zh-guoyu -> zh", 1016 # "zh-hakka -> hak", and 1017 # "und-hakka -> und". 1018 # Given the possible input "zh-guoyu-hakka", the first rule will 1019 # change it to "zh-hakka", and then the second rule can be 1020 # applied. (The third rule isn't applied ever.) 1021 # 1022 # Let's assume there's a hypothetical rule 1023 # "zh-aaaaa" -> "en" 1024 # And we have the input "zh-aaaaa-hakka", then "zh-aaaaa -> en" 1025 # is applied before "zh-hakka -> hak", because rules are sorted 1026 # alphabetically. That means the overall result is "en": 1027 # "zh-aaaaa-hakka" is first canonicalized to "en-hakka" and then 1028 # "hakka" is removed through the third rule. 1029 # 1030 # No current rule requires to handle this special case, so we 1031 # don't yet support it. 1032 assert variants is None or variants <= i_variants 1033 1034 # Combine all variants and remove duplicates. 1035 vars = set( 1036 i_variants.split("-") 1037 + (variants.split("-") if variants else []) 1038 ) 1039 1040 # Add the variants alphabetically sorted. 
1041 n_type = (language, None, None, "-".join(sorted(vars))) 1042 1043 assert ( 1044 n_type not in transitive_rules 1045 or transitive_rules[n_type] == i_replacement 1046 ) 1047 transitive_rules[n_type] = i_replacement 1048 1049 continue 1050 1051 if i_script is not None and i_script == r_script: 1052 # This case currently doesn't occur, so we don't yet support it. 1053 raise ValueError( 1054 f"{type} -> {replacement} :: {i_type} -> {i_replacement}" 1055 ) 1056 if i_region is not None and i_region == r_region: 1057 # This case currently only applies for sign language 1058 # replacements. Similar to the language subtag case any other 1059 # combination isn't currently supported. 1060 assert type == (None, None, Any, None) 1061 assert replacement == (None, None, Any, None) 1062 assert i_type == ("sgn", None, Any, None) 1063 assert i_replacement == (Any, None, None, None) 1064 1065 n_type = ("sgn", None, region, None) 1066 1067 assert n_type not in transitive_rules 1068 transitive_rules[n_type] = i_replacement 1069 1070 continue 1071 1072 if i_variants is not None and i_variants == r_variants: 1073 # This case currently doesn't occur, so we don't yet support it. 1074 raise ValueError( 1075 f"{type} -> {replacement} :: {i_type} -> {i_replacement}" 1076 ) 1077 1078 # Ensure there are no contradicting rules. 1079 assert all( 1080 rules[type] == replacement 1081 for (type, replacement) in transitive_rules.items() 1082 if type in rules 1083 ) 1084 1085 # If |transitive_rules| is not a subset of |rules|, new rules will be added. 1086 modified_rules = not (transitive_rules.keys() <= rules.keys()) 1087 1088 # Ensure we only have to iterate more than once for the "guoyo-{hakka,xiang}" 1089 # case. Failing this assertion means either there's a bug when computing the 1090 # stop condition of this loop or a new kind of legacy language tags was added. 
1091 if modified_rules and loop_count > 1: 1092 new_rules = {k for k in transitive_rules.keys() if k not in rules} 1093 for k in new_rules: 1094 assert k in { 1095 (Any, None, None, "guoyu-hakka"), 1096 (Any, None, None, "guoyu-xiang"), 1097 } 1098 1099 # Merge the transitive rules. 1100 rules.update(transitive_rules) 1101 1102 # Computes the size of the union of all field value sets. 1103 def multi_map_size(locale_id): 1104 (language, script, region, variants) = locale_id 1105 1106 return ( 1107 (1 if language is not None else 0) 1108 + (1 if script is not None else 0) 1109 + (1 if region is not None else 0) 1110 + (len(variants.split("-")) if variants is not None else 0) 1111 ) 1112 1113 # Dictionary of legacy mappings, contains raw rules, e.g. 1114 # (None, None, None, "hepburn-heploc") -> (None, None, None, "alalc97"). 1115 legacy_mappings = {} 1116 1117 # Dictionary of simple language subtag mappings, e.g. "in" -> "id". 1118 language_mappings = {} 1119 1120 # Dictionary of complex language subtag mappings, modifying more than one 1121 # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME"). 1122 complex_language_mappings = {} 1123 1124 # Dictionary of simple script subtag mappings, e.g. "Qaai" -> "Zinh". 1125 script_mappings = {} 1126 1127 # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE". 1128 region_mappings = {} 1129 1130 # Dictionary of complex region subtag mappings, containing more than one 1131 # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]). 1132 complex_region_mappings = {} 1133 1134 # Dictionary of aliased variant subtags to a tuple of preferred replacement 1135 # type and replacement, e.g. "arevela" -> ("language", "hy") or 1136 # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97"). 1137 variant_mappings = {} 1138 1139 # Preprocess all rules so we can perform a single lookup per subtag at runtime. 
1140 for type, replacement in rules.items(): 1141 (language, script, region, variants) = type 1142 (r_language, r_script, r_region, r_variants) = replacement 1143 1144 type_map_size = multi_map_size(type) 1145 1146 # Most mappings are one-to-one and can be encoded through lookup tables. 1147 if type_map_size == 1: 1148 if language is not None: 1149 assert r_language is not None, "Can't remove a language subtag" 1150 1151 # We don't yet support this case. 1152 assert r_variants is None, ( 1153 f"Unhandled variant replacement in language alias: {replacement}" 1154 ) 1155 1156 if replacement == (Any, None, None, None): 1157 language_mappings[language] = r_language 1158 else: 1159 complex_language_mappings[language] = replacement[:-1] 1160 elif script is not None: 1161 # We don't support removing script subtags. 1162 assert r_script is not None, ( 1163 f"Can't remove a script subtag: {replacement}" 1164 ) 1165 1166 # We only support one-to-one script mappings for now. 1167 assert replacement == ( 1168 None, 1169 Any, 1170 None, 1171 None, 1172 ), f"Unhandled replacement in script alias: {replacement}" 1173 1174 script_mappings[script] = r_script 1175 elif region is not None: 1176 # We don't support removing region subtags. 1177 assert r_region is not None, ( 1178 f"Can't remove a region subtag: {replacement}" 1179 ) 1180 1181 # We only support one-to-one region mappings for now. 1182 assert replacement == ( 1183 None, 1184 None, 1185 Any, 1186 None, 1187 ), f"Unhandled replacement in region alias: {replacement}" 1188 1189 if type not in territory_exception_rules: 1190 region_mappings[region] = r_region 1191 else: 1192 complex_region_mappings[region] = [ 1193 r_region 1194 for (_, _, r_region, _) in territory_exception_rules[type] 1195 ] 1196 else: 1197 assert variants is not None 1198 assert len(variants.split("-")) == 1 1199 1200 # We only support one-to-one variant mappings for now. 
1201 assert multi_map_size(replacement) <= 1, ( 1202 f"Unhandled replacement in variant alias: {replacement}" 1203 ) 1204 1205 if r_language is not None: 1206 variant_mappings[variants] = ("language", r_language) 1207 elif r_script is not None: 1208 variant_mappings[variants] = ("script", r_script) 1209 elif r_region is not None: 1210 variant_mappings[variants] = ("region", r_region) 1211 elif r_variants is not None: 1212 assert len(r_variants.split("-")) == 1 1213 variant_mappings[variants] = ("variant", r_variants) 1214 else: 1215 variant_mappings[variants] = None 1216 else: 1217 # Alias rules which have multiple input fields must be processed 1218 # first. This applies only to a handful of rules, so our generated 1219 # code adds fast paths to skip these rules in the common case. 1220 1221 # Case 1: Language and at least one variant subtag. 1222 if language is not None and variants is not None: 1223 pass 1224 1225 # Case 2: Sign language and a region subtag. 1226 elif language == "sgn" and region is not None: 1227 pass 1228 1229 # Case 3: "hepburn-heploc" to "alalc97" canonicalization. 1230 elif ( 1231 language is None 1232 and variants is not None 1233 and len(variants.split("-")) == 2 1234 ): 1235 pass 1236 1237 # Any other combination is currently unsupported. 
1238 else: 1239 raise ValueError(f"{type} -> {replacement}") 1240 1241 legacy_mappings[type] = replacement 1242 1243 tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml")) 1244 1245 likely_subtags = {} 1246 1247 for likely_subtag in tree.iterfind(".//likelySubtag"): 1248 from_tag = bcp47_id(likely_subtag.get("from")) 1249 from_match = re_unicode_language_id.match(from_tag) 1250 assert from_match is not None, ( 1251 f"{from_tag} invalid Unicode BCP 47 locale identifier" 1252 ) 1253 assert from_match.group("variants") is None, ( 1254 f"unexpected variant subtags in {from_tag}" 1255 ) 1256 1257 to_tag = bcp47_id(likely_subtag.get("to")) 1258 to_match = re_unicode_language_id.match(to_tag) 1259 assert to_match is not None, ( 1260 f"{to_tag} invalid Unicode BCP 47 locale identifier" 1261 ) 1262 assert to_match.group("variants") is None, ( 1263 f"unexpected variant subtags in {to_tag}" 1264 ) 1265 1266 from_canonical = bcp47_canonical( 1267 *from_match.group("language", "script", "region", "variants") 1268 ) 1269 1270 to_canonical = bcp47_canonical( 1271 *to_match.group("language", "script", "region", "variants") 1272 ) 1273 1274 # Remove the empty variant subtags. 1275 from_canonical = from_canonical[:-1] 1276 to_canonical = to_canonical[:-1] 1277 1278 likely_subtags[from_canonical] = to_canonical 1279 1280 complex_region_mappings_final = {} 1281 1282 for deprecated_region, replacements in complex_region_mappings.items(): 1283 # Find all likely subtag entries which don't already contain a region 1284 # subtag and whose target region is in the list of replacement regions. 1285 region_likely_subtags = [ 1286 (from_language, from_script, to_region) 1287 for ( 1288 (from_language, from_script, from_region), 1289 (_, _, to_region), 1290 ) in likely_subtags.items() 1291 if from_region is None and to_region in replacements 1292 ] 1293 1294 # The first replacement entry is the default region. 
1295 default = replacements[0] 1296 1297 # Find all likely subtag entries whose region matches the default region. 1298 default_replacements = { 1299 (language, script) 1300 for (language, script, region) in region_likely_subtags 1301 if region == default 1302 } 1303 1304 # And finally find those entries which don't use the default region. 1305 # These are the entries we're actually interested in, because those need 1306 # to be handled specially when selecting the correct preferred region. 1307 non_default_replacements = [ 1308 (language, script, region) 1309 for (language, script, region) in region_likely_subtags 1310 if (language, script) not in default_replacements 1311 ] 1312 1313 # Remove redundant mappings. 1314 # 1315 # For example starting with CLDR 43, the deprecated region "SU" has the 1316 # following non-default replacement entries for "GE": 1317 # - ('sva', None, 'GE') 1318 # - ('sva', 'Cyrl', 'GE') 1319 # - ('sva', 'Latn', 'GE') 1320 # 1321 # The latter two entries are redundant, because they're already handled 1322 # by the first entry. 1323 non_default_replacements = [ 1324 (language, script, region) 1325 for (language, script, region) in non_default_replacements 1326 if script is None 1327 or (language, None, region) not in non_default_replacements 1328 ] 1329 1330 # If there are no non-default replacements, we can handle the region as 1331 # part of the simple region mapping. 
        # Region has non-default replacements: keep the default plus the
        # (language, script)-specific overrides for the generated code.
        if non_default_replacements:
            complex_region_mappings_final[deprecated_region] = (
                default,
                non_default_replacements,
            )
        else:
            # All likely-subtag entries agree on the default region, so a
            # simple one-to-one mapping suffices.
            region_mappings[deprecated_region] = default

    return {
        "legacyMappings": legacy_mappings,
        "languageMappings": language_mappings,
        "complexLanguageMappings": complex_language_mappings,
        "scriptMappings": script_mappings,
        "regionMappings": region_mappings,
        "complexRegionMappings": complex_region_mappings_final,
        "variantMappings": variant_mappings,
        "likelySubtags": likely_subtags,
    }


def readUnicodeExtensions(core_file):
    """Read deprecated-to-preferred Unicode extension value mappings.

    |core_file| is the CLDR "common.zip" archive (a ZipFile); the mappings are
    extracted from the common/bcp47/*.xml data files and from
    common/supplemental/supplementalMetadata.xml.
    """
    import xml.etree.ElementTree as ET

    # Match all xml-files in the BCP 47 directory.
    bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")

    # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
    #
    # type = alphanum{3,8} (sep alphanum{3,8})* ;
    typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")

    # https://www.unicode.org/reports/tr35/#Unicode_language_identifier
    #
    # unicode_region_subtag = alpha{2} ;
    alphaRegionRE = re.compile(r"^[A-Z]{2}$", re.IGNORECASE)

    # Mapping from Unicode extension types to dict of deprecated to
    # preferred values.
    mapping = {
        # Unicode BCP 47 U Extension
        "u": {},
        # Unicode BCP 47 T Extension
        "t": {},
    }

    def readBCP47File(file):
        # Collect alias/preferred entries from one common/bcp47/*.xml file
        # into |mapping|, keyed by extension ("u"/"t") and extension key name.
        tree = ET.parse(file)
        for keyword in tree.iterfind(".//keyword/key"):
            extension = keyword.get("extension", "u")
            assert extension in {"u", "t"}, f"unknown extension type: {extension}"

            extension_name = keyword.get("name")

            for type in keyword.iterfind("type"):
                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The key or type name used by Unicode locale extension with 'u' extension
                # syntax or the 't' extensions syntax. When alias below is absent, this name
                # can be also used with the old style "@key=type" syntax.
                name = type.get("name")

                # Ignore the special name:
                # - <https://unicode.org/reports/tr35/#CODEPOINTS>
                # - <https://unicode.org/reports/tr35/#REORDER_CODE>
                # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
                # - <https://unicode.org/reports/tr35/#SCRIPT_CODE>
                # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
                # - <https://unicode.org/reports/tr35/#PRIVATE_USE>
                if name in (
                    "CODEPOINTS",
                    "REORDER_CODE",
                    "RG_KEY_VALUE",
                    "SCRIPT_CODE",
                    "SUBDIVISION_CODE",
                    "PRIVATE_USE",
                ):
                    continue

                # All other names should match the 'type' production.
                assert typeRE.match(name) is not None, (
                    f"{name} matches the 'type' production"
                )

                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The preferred value of the deprecated key, type or attribute element.
                # When a key, type or attribute element is deprecated, this attribute is
                # used for specifying a new canonical form if available.
                preferred = type.get("preferred")

                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
                #
                # The BCP 47 form is the canonical form, and recommended. Other aliases are
                # included only for backwards compatibility.
                alias = type.get("alias")

                # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
                #
                # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
                # canonical forms. See Section 3.6.4 U Extension Data Files) and Section
                # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
                # value, while the canonical is in the name attribute value.

                # 'preferred' contains the new preferred name, 'alias' the compatibility
                # name, but then there's this entry where 'preferred' and 'alias' are the
                # same. So which one to choose? Assume 'preferred' is the actual canonical
                # name.
                #
                # <type name="islamicc"
                #       description="Civil (algorithmic) Arabic calendar"
                #       deprecated="true"
                #       preferred="islamic-civil"
                #       alias="islamic-civil"/>

                if preferred is not None:
                    assert typeRE.match(preferred), preferred
                    mapping[extension].setdefault(extension_name, {})[name] = preferred

                if alias is not None:
                    for alias_name in alias.lower().split(" "):
                        # Ignore alias entries which don't match the 'type' production.
                        if typeRE.match(alias_name) is None:
                            continue

                        # See comment above when 'alias' and 'preferred' are both present.
                        if (
                            preferred is not None
                            and name in mapping[extension][extension_name]
                        ):
                            continue

                        # Skip over entries where 'name' and 'alias' are equal.
                        #
                        # <type name="pst8pdt"
                        #       description="POSIX style time zone for US Pacific Time"
                        #       alias="PST8PDT"
                        #       since="1.8"/>
                        if name == alias_name:
                            continue

                        mapping[extension].setdefault(extension_name, {})[
                            alias_name
                        ] = name

    def readSupplementalMetadata(file):
        # Find subdivision and region replacements.
        #
        # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
        #
        # Replace aliases in special key values:
        #   - If there is an 'sd' or 'rg' key, replace any subdivision alias
        #     in its value in the same way, using subdivisionAlias data.
        tree = ET.parse(file)
        for alias in tree.iterfind(".//subdivisionAlias"):
            type = alias.get("type")
            assert typeRE.match(type) is not None, (
                f"{type} matches the 'type' production"
            )

            # Take the first replacement when multiple ones are present.
            replacement = alias.get("replacement").split(" ")[0].lower()

            # Append "zzzz" if the replacement is a two-letter region code.
            if alphaRegionRE.match(replacement) is not None:
                replacement += "zzzz"

            # Assert the replacement is syntactically correct.
            assert typeRE.match(replacement) is not None, (
                f"replacement {replacement} matches the 'type' production"
            )

            # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
            mapping["u"].setdefault("rg", {})[type] = replacement
            mapping["u"].setdefault("sd", {})[type] = replacement

    # Process every bcp47 XML data file in the archive.
    for name in core_file.namelist():
        if bcpFileRE.match(name):
            readBCP47File(core_file.open(name))

    readSupplementalMetadata(
        core_file.open("common/supplemental/supplementalMetadata.xml")
    )

    return {
        "unicodeMappings": mapping["u"],
        "transformMappings": mapping["t"],
    }


def writeCLDRLanguageTagData(println, data, url):
    """Writes the language tag data to the Intl data file."""

    println(generatedFileWarning)
    println("// Version: CLDR-{}".format(data["version"]))
    println(f"// URL: {url}")

    # Static C++ prelude: includes plus the binary-search helpers and the
    # DEBUG-only case validators used by the generated mapping functions.
    println(
        """
#include "mozilla/Assertions.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <string>

#include "mozilla/intl/Locale.h"

using namespace mozilla::intl::LanguageTagLimits;

template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline bool HasReplacement(
    const char (&subtags)[Length][TagLength],
    const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.Length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  const char* ptr = subtag.Span().data();
  return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
                            [](const char* a, const char* b) {
                              return memcmp(a, b, TagLength - 1) < 0;
                            });
}

template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline const char* SearchReplacement(
    const char (&subtags)[Length][TagLength], const char* (&aliases)[Length],
    const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.Length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  const char* ptr = subtag.Span().data();
  auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
                            [](const char* a, const char* b) {
                              return memcmp(a, b, TagLength - 1) < 0;
                            });
  if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
    return aliases[std::distance(std::begin(subtags), p)];
  }
  return nullptr;
}

#ifdef DEBUG
static bool IsAsciiLowercaseAlphanumeric(char c) {
  return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
}

static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
  return IsAsciiLowercaseAlphanumeric(c) || c == '-';
}

static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
  return std::all_of(span.begin(), span.end(),
                     mozilla::IsAsciiLowercaseAlpha<char>);
}

static bool IsCanonicallyCasedScriptTag(mozilla::Span<const char> span) {
  return mozilla::IsAsciiUppercaseAlpha(span[0]) &&
         std::all_of(span.begin() + 1, span.end(),
                     mozilla::IsAsciiLowercaseAlpha<char>);
}

static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
  return std::all_of(span.begin(), span.end(),
                     mozilla::IsAsciiUppercaseAlpha<char>) ||
         std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>);
}

static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
  return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
}

static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
  return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
}

static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
  return std::all_of(type.begin(), type.end(),
                     IsAsciiLowercaseAlphanumericOrDash);
}

static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
  return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
}

static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
  return std::all_of(type.begin(), type.end(),
                     IsAsciiLowercaseAlphanumericOrDash);
}
#endif
""".rstrip()
    )

    source = "CLDR Supplemental Data, version {}".format(data["version"])
    legacy_mappings = data["legacyMappings"]
    language_mappings = data["languageMappings"]
    complex_language_mappings = data["complexLanguageMappings"]
    script_mappings = data["scriptMappings"]
    region_mappings = data["regionMappings"]
    complex_region_mappings = data["complexRegionMappings"]
    variant_mappings = data["variantMappings"]
    unicode_mappings = data["unicodeMappings"]
    transform_mappings = data["transformMappings"]

    # Maximum subtag lengths from the TR35 grammar; they size the generated
    # fixed-width lookup tables.

    # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
    language_maxlength = 8

    # unicode_script_subtag = alpha{4} ;
    script_maxlength = 4

    # unicode_region_subtag = (alpha{2} | digit{3}) ;
    region_maxlength = 3

    writeMappingsBinarySearch(
        println,
        "LanguageMapping",
        "LanguageSubtag&",
        "language",
        "IsStructurallyValidLanguageTag",
        "IsCanonicallyCasedLanguageTag",
        language_mappings,
        language_maxlength,
        "Mappings from language subtags to preferred values.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "ComplexLanguageMapping",
        "const LanguageSubtag&",
        "language",
        "IsStructurallyValidLanguageTag",
        "IsCanonicallyCasedLanguageTag",
        complex_language_mappings.keys(),
        language_maxlength,
        "Language subtags with complex mappings.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "ScriptMapping",
        "ScriptSubtag&",
        "script",
        "IsStructurallyValidScriptTag",
        "IsCanonicallyCasedScriptTag",
        script_mappings,
        script_maxlength,
        "Mappings from script subtags to preferred values.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "RegionMapping",
        "RegionSubtag&",
        "region",
        "IsStructurallyValidRegionTag",
        "IsCanonicallyCasedRegionTag",
        region_mappings,
        region_maxlength,
        "Mappings from region subtags to preferred values.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "ComplexRegionMapping",
        "const RegionSubtag&",
        "region",
        "IsStructurallyValidRegionTag",
        "IsCanonicallyCasedRegionTag",
        complex_region_mappings.keys(),
        region_maxlength,
        "Region subtags with complex mappings.",
        source,
        url,
    )

    writeComplexLanguageTagMappings(
        println,
        complex_language_mappings,
        "Language subtags with complex mappings.",
        source,
        url,
    )
    writeComplexRegionTagMappings(
        println,
        complex_region_mappings,
        "Region subtags with complex mappings.",
        source,
        url,
    )

    writeVariantTagMappings(
        println,
        variant_mappings,
        "Mappings from variant subtags to preferred values.",
        source,
        url,
    )

    writeLegacyMappingsFunction(
        println, legacy_mappings, "Canonicalize legacy locale identifiers.", source, url
    )

    writeSignLanguageMappingsFunction(
        println, legacy_mappings, "Mappings from legacy sign languages.", source, url
    )

    writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode")
    writeUnicodeExtensionsMappings(println, transform_mappings, "Transform")


def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
    """Writes the likely-subtags test file."""

    println(generatedFileWarning)

    source = "CLDR Supplemental Data, version {}".format(data["version"])
    language_mappings = data["languageMappings"]
    complex_language_mappings = data["complexLanguageMappings"]
    script_mappings = data["scriptMappings"]
    region_mappings = data["regionMappings"]
    complex_region_mappings = data["complexRegionMappings"]
    likely_subtags = data["likelySubtags"]

    def bcp47(tag):
        # Format a (language, script, region) tuple as a BCP 47 tag string.
        (language, script, region) = tag
        return "{}{}{}".format(
            language, "-" + script if script else "", "-" + region if region else ""
        )

    def canonical(tag):
        # Canonicalize a (language, script, region) tuple using the simple
        # and complex mapping tables read from CLDR.
        (language, script, region) = tag

        # Map deprecated language subtags.
        if language in language_mappings:
            language = language_mappings[language]
        elif language in complex_language_mappings:
            (language2, script2, region2) = complex_language_mappings[language]
            # Complex mappings only fill in script/region when the input
            # doesn't already provide them.
            (language, script, region) = (
                language2,
                script if script else script2,
                region if region else region2,
            )

        # Map deprecated script subtags.
        if script in script_mappings:
            script = script_mappings[script]

        # Map deprecated region subtags.
        if region in region_mappings:
            region = region_mappings[region]
        else:
            # Assume no complex region mappings are needed for now.
            assert region not in complex_region_mappings, (
                f"unexpected region with complex mappings: {region}"
            )

        return (language, script, region)

    # https://unicode.org/reports/tr35/#Likely_Subtags

    def addLikelySubtags(tag):
        # Step 1: Canonicalize.
        (language, script, region) = canonical(tag)
        if script == "Zzzz":
            script = None
        if region == "ZZ":
            region = None

        # Step 2: Lookup.
        searches = (
            (language, script, region),
            (language, script, None),
            (language, None, region),
            (language, None, None),
        )
        search = next(search for search in searches if search in likely_subtags)

        (language_s, script_s, region_s) = search
        (language_m, script_m, region_m) = likely_subtags[search]

        # Step 3: Return.
        return (
            language if language != language_s else language_m,
            script if script != script_s else script_m,
            region if region != region_s else region_m,
        )

    # https://unicode.org/reports/tr35/#Likely_Subtags
    def removeLikelySubtags(tag):
        # Step 1: Add likely subtags.
        max = addLikelySubtags(tag)

        # Step 2: Remove variants (doesn't apply here).

        # Step 3: Find a match.
        (language, script, region) = max
        for trial in (
            (language, None, None),
            (language, None, region),
            (language, script, None),
        ):
            if addLikelySubtags(trial) == max:
                return trial

        # Step 4: Return maximized if no match found.
        return max

    def likely_canonical(from_tag, to_tag):
        # Canonicalize the input tag.
        from_tag = canonical(from_tag)

        # Update the expected result if necessary.
        if from_tag in likely_subtags:
            to_tag = likely_subtags[from_tag]

        # Canonicalize the expected output.
        to_canonical = canonical(to_tag)

        # Sanity check: This should match the result of |addLikelySubtags|.
        assert to_canonical == addLikelySubtags(from_tag)

        return to_canonical

    # |likely_subtags| contains non-canonicalized tags, so canonicalize it first.
1862 likely_subtags_canonical = { 1863 k: likely_canonical(k, v) for (k, v) in likely_subtags.items() 1864 } 1865 1866 # Add test data for |Intl.Locale.prototype.maximize()|. 1867 writeMappingsVar( 1868 println, 1869 {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()}, 1870 "maxLikelySubtags", 1871 "Extracted from likelySubtags.xml.", 1872 source, 1873 url, 1874 ) 1875 1876 # Use the maximalized tags as the input for the remove likely-subtags test. 1877 minimized = { 1878 tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values() 1879 } 1880 1881 # Add test data for |Intl.Locale.prototype.minimize()|. 1882 writeMappingsVar( 1883 println, 1884 {bcp47(k): bcp47(v) for (k, v) in minimized.items()}, 1885 "minLikelySubtags", 1886 "Extracted from likelySubtags.xml.", 1887 source, 1888 url, 1889 ) 1890 1891 println( 1892 """ 1893 for (let [tag, maximal] of Object.entries(maxLikelySubtags)) { 1894 assertEq(new Intl.Locale(tag).maximize().toString(), maximal); 1895 }""" 1896 ) 1897 1898 println( 1899 """ 1900 for (let [tag, minimal] of Object.entries(minLikelySubtags)) { 1901 assertEq(new Intl.Locale(tag).minimize().toString(), minimal); 1902 }""" 1903 ) 1904 1905 println( 1906 """ 1907 if (typeof reportCompare === "function") 1908 reportCompare(0, 0);""" 1909 ) 1910 1911 1912 def readCLDRVersionFromICU(): 1913 icuDir = os.path.join(topsrcdir, "intl/icu/source") 1914 if not os.path.isdir(icuDir): 1915 raise RuntimeError(f"not a directory: {icuDir}") 1916 1917 reVersion = re.compile(r'\s*cldrVersion\{"(\d+(?:\.\d+)?)"\}') 1918 1919 for line in flines(os.path.join(icuDir, "data/misc/supplementalData.txt")): 1920 m = reVersion.match(line) 1921 if m: 1922 version = m.group(1) 1923 break 1924 1925 if version is None: 1926 raise RuntimeError("can't resolve CLDR version") 1927 1928 return version 1929 1930 1931 def updateCLDRLangTags(args): 1932 """Update the LanguageTagGenerated.cpp file.""" 1933 version = args.version 1934 url = args.url 1935 out 
= args.out 1936 filename = args.file 1937 1938 # Determine current CLDR version from ICU. 1939 if version is None: 1940 version = readCLDRVersionFromICU() 1941 1942 url = url.replace("<VERSION>", version) 1943 1944 print("Arguments:") 1945 print("\tCLDR version: %s" % version) 1946 print("\tDownload url: %s" % url) 1947 if filename is not None: 1948 print("\tLocal CLDR common.zip file: %s" % filename) 1949 print("\tOutput file: %s" % out) 1950 print("") 1951 1952 data = { 1953 "version": version, 1954 } 1955 1956 def readFiles(cldr_file): 1957 with ZipFile(cldr_file) as zip_file: 1958 data.update(readSupplementalData(zip_file)) 1959 data.update(readUnicodeExtensions(zip_file)) 1960 1961 print("Processing CLDR data...") 1962 if filename is not None: 1963 print("Always make sure you have the newest CLDR common.zip!") 1964 with open(filename, "rb") as cldr_file: 1965 readFiles(cldr_file) 1966 else: 1967 print("Downloading CLDR common.zip...") 1968 with closing(urlopen(url)) as cldr_file: 1969 cldr_data = io.BytesIO(cldr_file.read()) 1970 readFiles(cldr_data) 1971 1972 print("Writing Intl data...") 1973 with open(out, mode="w", encoding="utf-8", newline="") as f: 1974 println = partial(print, file=f) 1975 1976 writeCLDRLanguageTagData(println, data, url) 1977 1978 print("Writing Intl test data...") 1979 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 1980 test_file = os.path.join( 1981 js_src_builtin_intl_dir, 1982 "../../tests/non262/Intl/Locale/likely-subtags-generated.js", 1983 ) 1984 with open(test_file, mode="w", encoding="utf-8", newline="") as f: 1985 println = partial(print, file=f) 1986 1987 println("// |reftest| skip-if(!this.hasOwnProperty('Intl'))") 1988 writeCLDRLanguageTagLikelySubtagsTest(println, data, url) 1989 1990 1991 def flines(filepath, encoding="utf-8"): 1992 """Open filepath and iterate over its content.""" 1993 with open(filepath, encoding=encoding) as f: 1994 yield from f 1995 1996 1997 @total_ordering 1998 class Zone: 
    """Time zone with optional file name."""

    def __init__(self, name, filename=""):
        self.name = name
        self.filename = filename

    # Zones compare (and hash) by name only; the originating file name is
    # deliberately ignored.
    def __eq__(self, other):
        return hasattr(other, "name") and self.name == other.name

    def __lt__(self, other):
        return self.name < other.name

    def __hash__(self):
        return hash(self.name)

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name


# TzDataDir and TzDataFile expose the same duck-typed interface (name,
# resolve, basename, isfile, listdir, readlines), so consumers can process
# a tzdata checkout and a tzdata tarball uniformly.
class TzDataDir:
    """tzdata source from a directory."""

    def __init__(self, obj):
        self.name = partial(os.path.basename, obj)
        self.resolve = partial(os.path.join, obj)
        self.basename = os.path.basename
        self.isfile = os.path.isfile
        self.listdir = partial(os.listdir, obj)
        self.readlines = flines


class TzDataFile:
    """tzdata source from a file (tar or gzipped)."""

    def __init__(self, obj):
        # Strip two extensions, e.g. "tzdata2024a.tar.gz" -> "tzdata2024a".
        self.name = lambda: os.path.splitext(
            os.path.splitext(os.path.basename(obj))[0]
        )[0]
        self.resolve = obj.getmember
        self.basename = attrgetter("name")
        self.isfile = tarfile.TarInfo.isfile
        self.listdir = obj.getnames
        self.readlines = partial(self._tarlines, obj)

    def _tarlines(self, tar, m):
        # Yield the decoded lines of tar member |m|.
        with closing(tar.extractfile(m)) as f:
            for line in f:
                yield line.decode("utf-8")


def validateTimeZones(zones, links):
    """Validate the zone and link entries."""
    # A name must not be both a primary zone and a link alias.
    linkZones = set(links.keys())
    intersect = linkZones.intersection(zones)
    if intersect:
        raise RuntimeError("Links also present in zones: %s" % intersect)

    # Every link must resolve to a known zone.
    zoneNames = {z.name for z in zones}
    linkTargets = set(links.values())
    if not linkTargets.issubset(zoneNames):
        raise RuntimeError(
            "Link targets not found: %s" % linkTargets.difference(zoneNames)
        )


def partition(iterable, *predicates):
    # Split |iterable| into len(predicates) + 1 lazy partitions: one per
    # predicate (elements matching it, after failing all earlier predicates)
    # plus a final partition of the remaining elements. The results are
    # iterators backed by itertools.tee, so consume them only once.
    def innerPartition(pred, it):
        it1, it2 = tee(it)
        return (filter(pred, it1), filterfalse(pred, it2))

    if len(predicates) == 0:
        return iterable
    (left, right) = innerPartition(predicates[0], iterable)
    if len(predicates) == 1:
        return (left, right)
    return tuple([left] + list(partition(right, *predicates[1:])))


def listIANAFiles(tzdataDir):
    # List the plain data files (all-lowercase alphanumeric names) in the
    # tzdata source, skipping e.g. Makefiles and *.awk scripts.
    def isTzFile(d, m, f):
        return m(f) and d.isfile(d.resolve(f))

    return filter(
        partial(isTzFile, tzdataDir, re.compile("^[a-z0-9]+$").match),
        tzdataDir.listdir(),
    )


def readIANAFiles(tzdataDir, files):
    """Read all IANA time zone files from the given iterable."""
    nameSyntax = r"[\w/+\-]+"
    pZone = re.compile(r"Zone\s+(?P<name>%s)\s+.*" % nameSyntax)
    pLink = re.compile(
        r"(#PACKRATLIST\s+zone.tab\s+)?Link\s+(?P<target>%s)\s+(?P<name>%s)(?:\s+#.*)?"
        % (nameSyntax, nameSyntax)
    )

    def createZone(line, fname):
        match = pZone.match(line)
        name = match.group("name")
        return Zone(name, fname)

    def createLink(line, fname):
        # The link's alias becomes the Zone; |target| is the canonical name.
        match = pLink.match(line)
        (name, target) = match.group("name", "target")
        return (Zone(name, fname), target)

    zones = set()
    links = dict()
    packrat_links = dict()
    for filename in files:
        filepath = tzdataDir.resolve(filename)
        for line in tzdataDir.readlines(filepath):
            if line.startswith("Zone"):
                zones.add(createZone(line, filename))
            if line.startswith("Link"):
                (link, target) = createLink(line, filename)
                links[link] = target
            # Commented-out links that "zic -b packrat" would include.
            if line.startswith("#PACKRATLIST zone.tab Link"):
                (link, target) = createLink(line, filename)
                packrat_links[link] = target

    return (zones, links, packrat_links)


def readIANATimeZones(tzdataDir, ignoreFactory):
    """Read the IANA time zone information from `tzdataDir`."""

    files_to_ignore = ["backzone"]

    # Ignore the placeholder time zone "Factory".
2133 if ignoreFactory: 2134 files_to_ignore.append("factory") 2135 2136 tzfiles = (file for file in listIANAFiles(tzdataDir) if file not in files_to_ignore) 2137 2138 # Read zone and link infos. 2139 (zones, links, _) = readIANAFiles(tzdataDir, tzfiles) 2140 2141 validateTimeZones(zones, links) 2142 2143 return (zones, links) 2144 2145 2146 def readICUResourceFile(filename): 2147 """Read an ICU resource file. 2148 2149 Yields (<table-name>, <startOrEnd>, <value>) for each table. 2150 """ 2151 2152 numberValue = r"-?\d+" 2153 stringValue = r'".+?"' 2154 2155 def asVector(val): 2156 return r"%s(?:\s*,\s*%s)*" % (val, val) 2157 2158 numberVector = asVector(numberValue) 2159 stringVector = asVector(stringValue) 2160 2161 reNumberVector = re.compile(numberVector) 2162 reStringVector = re.compile(stringVector) 2163 reNumberValue = re.compile(numberValue) 2164 reStringValue = re.compile(stringValue) 2165 2166 def parseValue(value): 2167 m = reNumberVector.match(value) 2168 if m: 2169 return [int(v) for v in reNumberValue.findall(value)] 2170 m = reStringVector.match(value) 2171 if m: 2172 return [v[1:-1] for v in reStringValue.findall(value)] 2173 raise RuntimeError("unknown value type: %s" % value) 2174 2175 def extractValue(values): 2176 if len(values) == 0: 2177 return None 2178 if len(values) == 1: 2179 return values[0] 2180 return values 2181 2182 def line(*args): 2183 maybeMultiComments = r"(?:/\*[^*]*\*/)*" 2184 maybeSingleComment = r"(?://.*)?" 
2185 lineStart = "^%s" % maybeMultiComments 2186 lineEnd = r"%s\s*%s$" % (maybeMultiComments, maybeSingleComment) 2187 return re.compile(r"\s*".join(chain([lineStart], args, [lineEnd]))) 2188 2189 tableName = r'(?P<quote>"?)(?P<name>.+?)(?P=quote)' 2190 tableValue = r"(?P<value>%s|%s)" % (numberVector, stringVector) 2191 2192 reStartTable = line(tableName, r"\{") 2193 reEndTable = line(r"\}") 2194 reSingleValue = line(r",?", tableValue, r",?") 2195 reCompactTable = line(tableName, r"\{", tableValue, r"\}") 2196 reEmptyLine = line() 2197 2198 tables = [] 2199 2200 def currentTable(): 2201 return "|".join(tables) 2202 2203 values = [] 2204 for line in flines(filename, "utf-8-sig"): 2205 line = line.strip() 2206 if line == "": 2207 continue 2208 2209 m = reEmptyLine.match(line) 2210 if m: 2211 continue 2212 2213 m = reStartTable.match(line) 2214 if m: 2215 assert len(values) == 0 2216 tables.append(m.group("name")) 2217 continue 2218 2219 m = reEndTable.match(line) 2220 if m: 2221 yield (currentTable(), extractValue(values)) 2222 tables.pop() 2223 values = [] 2224 continue 2225 2226 m = reCompactTable.match(line) 2227 if m: 2228 assert len(values) == 0 2229 tables.append(m.group("name")) 2230 yield (currentTable(), extractValue(parseValue(m.group("value")))) 2231 tables.pop() 2232 continue 2233 2234 m = reSingleValue.match(line) 2235 if m and tables: 2236 values.extend(parseValue(m.group("value"))) 2237 continue 2238 2239 raise RuntimeError("unknown entry: %s" % line) 2240 2241 2242 def readICUTimeZonesFromTimezoneTypes(icuTzDir): 2243 """Read the ICU time zone information from `icuTzDir`/timezoneTypes.txt 2244 and returns the tuple (zones, links). 
2245 """ 2246 typeMapTimeZoneKey = "timezoneTypes:table(nofallback)|typeMap|timezone|" 2247 typeAliasTimeZoneKey = "timezoneTypes:table(nofallback)|typeAlias|timezone|" 2248 2249 def toTimeZone(name): 2250 return Zone(name.replace(":", "/")) 2251 2252 zones = set() 2253 links = dict() 2254 2255 for name, value in readICUResourceFile(os.path.join(icuTzDir, "timezoneTypes.txt")): 2256 if name.startswith(typeMapTimeZoneKey): 2257 zones.add(toTimeZone(name[len(typeMapTimeZoneKey) :])) 2258 if name.startswith(typeAliasTimeZoneKey): 2259 links[toTimeZone(name[len(typeAliasTimeZoneKey) :])] = value 2260 2261 validateTimeZones(zones, links) 2262 2263 return (zones, links) 2264 2265 2266 def readICUTimeZonesFromZoneInfo(icuTzDir): 2267 """Read the ICU time zone information from `icuTzDir`/zoneinfo64.txt 2268 and returns the tuple (zones, links). 2269 """ 2270 zoneKey = "zoneinfo64:table(nofallback)|Zones:array|:table" 2271 linkKey = "zoneinfo64:table(nofallback)|Zones:array|:int" 2272 namesKey = "zoneinfo64:table(nofallback)|Names" 2273 2274 tzId = 0 2275 tzLinks = dict() 2276 tzNames = [] 2277 2278 for name, value in readICUResourceFile(os.path.join(icuTzDir, "zoneinfo64.txt")): 2279 if name == zoneKey: 2280 tzId += 1 2281 elif name == linkKey: 2282 tzLinks[tzId] = int(value) 2283 tzId += 1 2284 elif name == namesKey: 2285 tzNames.extend(value) 2286 2287 links = {Zone(tzNames[zone]): tzNames[target] for (zone, target) in tzLinks.items()} 2288 zones = {Zone(v) for v in tzNames if Zone(v) not in links} 2289 2290 validateTimeZones(zones, links) 2291 2292 return (zones, links) 2293 2294 2295 def readICUTimeZones(icuDir, icuTzDir, ignoreFactory): 2296 # zoneinfo64.txt contains the supported time zones by ICU. This data is 2297 # generated from tzdata files, it doesn't include "backzone" in stock ICU. 2298 (zoneinfoZones, zoneinfoLinks) = readICUTimeZonesFromZoneInfo(icuTzDir) 2299 2300 # timezoneTypes.txt contains the canonicalization information for ICU. 
This 2301 # data is generated from CLDR files. It includes data about time zones from 2302 # tzdata's "backzone" file. 2303 (typesZones, typesLinks) = readICUTimeZonesFromTimezoneTypes(icuTzDir) 2304 2305 # Remove the placeholder time zone "Factory". 2306 # See also <https://github.com/eggert/tz/blob/master/factory>. 2307 if ignoreFactory: 2308 assert Zone("Factory") in zoneinfoZones 2309 assert Zone("Factory") not in zoneinfoLinks 2310 assert Zone("Factory") not in typesZones 2311 assert Zone("Factory") in typesLinks 2312 2313 zoneinfoZones.remove(Zone("Factory")) 2314 del typesLinks[Zone("Factory")] 2315 2316 # Remove the ICU placeholder time zone "Etc/Unknown". 2317 # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>. 2318 for zones in (zoneinfoZones, typesZones): 2319 zones.remove(Zone("Etc/Unknown")) 2320 2321 # Remove any outdated ICU links. 2322 for links in (zoneinfoLinks, typesLinks): 2323 for zone in otherICULegacyLinks().keys(): 2324 if zone not in links: 2325 raise KeyError(f"Can't remove non-existent link from '{zone}'") 2326 del links[zone] 2327 2328 # Information in zoneinfo64 should be a superset of timezoneTypes. 2329 def inZoneInfo64(zone): 2330 return zone in zoneinfoZones or zone in zoneinfoLinks 2331 2332 notFoundInZoneInfo64 = [zone for zone in typesZones if not inZoneInfo64(zone)] 2333 if notFoundInZoneInfo64: 2334 raise RuntimeError( 2335 "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64 2336 ) 2337 2338 notFoundInZoneInfo64 = [ 2339 zone for zone in typesLinks.keys() if not inZoneInfo64(zone) 2340 ] 2341 if notFoundInZoneInfo64: 2342 raise RuntimeError( 2343 "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64 2344 ) 2345 2346 # zoneinfo64.txt only defines the supported time zones by ICU, the canonicalization 2347 # rules are defined through timezoneTypes.txt. Merge both to get the actual zones 2348 # and links used by ICU. 
2349 icuZones = set( 2350 chain( 2351 (zone for zone in zoneinfoZones if zone not in typesLinks), 2352 (zone for zone in typesZones), 2353 ) 2354 ) 2355 icuLinks = dict( 2356 chain( 2357 ( 2358 (zone, target) 2359 for (zone, target) in zoneinfoLinks.items() 2360 if zone not in typesZones 2361 ), 2362 ((zone, target) for (zone, target) in typesLinks.items()), 2363 ) 2364 ) 2365 2366 return (icuZones, icuLinks) 2367 2368 2369 def readICULegacyZones(icuDir): 2370 """Read the ICU legacy time zones from `icuTzDir`/tools/tzcode/icuzones 2371 and returns the tuple (zones, links). 2372 """ 2373 tzdir = TzDataDir(os.path.join(icuDir, "tools/tzcode")) 2374 2375 # Per spec we must recognize only IANA time zones and links, but ICU 2376 # recognizes various legacy, non-IANA time zones and links. Compute these 2377 # non-IANA time zones and links. 2378 2379 # Most legacy, non-IANA time zones and links are in the icuzones file. 2380 (zones, links, _) = readIANAFiles(tzdir, ["icuzones"]) 2381 2382 # Remove the ICU placeholder time zone "Etc/Unknown". 2383 # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>. 2384 zones.remove(Zone("Etc/Unknown")) 2385 2386 # A handful of non-IANA zones/links are not in icuzones and must be added 2387 # manually so that we won't invoke ICU with them. 2388 for zone, target in otherICULegacyLinks().items(): 2389 if zone in links: 2390 if links[zone] != target: 2391 raise KeyError( 2392 f"Can't overwrite link '{zone} -> {links[zone]}' with '{target}'" 2393 ) 2394 else: 2395 print( 2396 f"Info: Link '{zone} -> {target}' can be removed from otherICULegacyLinks()" 2397 ) 2398 links[zone] = target 2399 2400 return (zones, links) 2401 2402 2403 def otherICULegacyLinks(): 2404 """The file `icuTzDir`/tools/tzcode/icuzones contains all ICU legacy time 2405 zones with the exception of time zones which are removed by IANA after an 2406 ICU release. 
2407 2408 For example ICU 67 uses tzdata2018i, but tzdata2020b removed the link from 2409 "US/Pacific-New" to "America/Los_Angeles". ICU standalone tzdata updates 2410 don't include modified icuzones files, so we must manually record any IANA 2411 modifications here. 2412 2413 After an ICU update, we can remove any no longer needed entries from this 2414 function by checking if the relevant entries are now included in icuzones. 2415 """ 2416 2417 return { 2418 # Current ICU is up-to-date with IANA, so this dict is empty. 2419 } 2420 2421 2422 def icuTzDataVersion(icuTzDir): 2423 """Read the ICU time zone version from `icuTzDir`/zoneinfo64.txt.""" 2424 2425 def searchInFile(pattern, f): 2426 p = re.compile(pattern) 2427 for line in flines(f, "utf-8-sig"): 2428 m = p.search(line) 2429 if m: 2430 return m.group(1) 2431 return None 2432 2433 zoneinfo = os.path.join(icuTzDir, "zoneinfo64.txt") 2434 if not os.path.isfile(zoneinfo): 2435 raise RuntimeError("file not found: %s" % zoneinfo) 2436 version = searchInFile(r"^//\s+tz version:\s+([0-9]{4}[a-z])$", zoneinfo) 2437 if version is None: 2438 raise RuntimeError( 2439 "%s does not contain a valid tzdata version string" % zoneinfo 2440 ) 2441 return version 2442 2443 2444 def findIncorrectICUZones(ianaZones, ianaLinks, icuZones, icuLinks): 2445 """Find incorrect ICU zone entries.""" 2446 2447 def isIANATimeZone(zone): 2448 return zone in ianaZones or zone in ianaLinks 2449 2450 def isICUTimeZone(zone): 2451 return zone in icuZones or zone in icuLinks 2452 2453 def isICULink(zone): 2454 return zone in icuLinks 2455 2456 # All IANA zones should be present in ICU. 2457 missingTimeZones = [zone for zone in ianaZones if not isICUTimeZone(zone)] 2458 if missingTimeZones: 2459 raise RuntimeError( 2460 "Not all zones are present in ICU, did you forget " 2461 "to run intl/update-tzdata.sh? %s" % missingTimeZones 2462 ) 2463 2464 # Zones which are only present in ICU? 
2465 additionalTimeZones = [zone for zone in icuZones if not isIANATimeZone(zone)] 2466 if additionalTimeZones: 2467 raise RuntimeError( 2468 "Additional zones present in ICU, did you forget " 2469 "to run intl/update-tzdata.sh? %s" % additionalTimeZones 2470 ) 2471 2472 # Zones which are marked as links in ICU. 2473 result = ((zone, icuLinks[zone]) for zone in ianaZones if isICULink(zone)) 2474 2475 # Remove unnecessary UTC mappings. 2476 utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"] 2477 result = ((zone, target) for (zone, target) in result if zone.name not in utcnames) 2478 2479 return sorted(result, key=itemgetter(0)) 2480 2481 2482 def findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks): 2483 """Find incorrect ICU link entries.""" 2484 2485 def isIANATimeZone(zone): 2486 return zone in ianaZones or zone in ianaLinks 2487 2488 def isICUTimeZone(zone): 2489 return zone in icuZones or zone in icuLinks 2490 2491 def isICULink(zone): 2492 return zone in icuLinks 2493 2494 def isICUZone(zone): 2495 return zone in icuZones 2496 2497 # All links should be present in ICU. 2498 missingTimeZones = [zone for zone in ianaLinks.keys() if not isICUTimeZone(zone)] 2499 if missingTimeZones: 2500 raise RuntimeError( 2501 "Not all zones are present in ICU, did you forget " 2502 "to run intl/update-tzdata.sh? %s" % missingTimeZones 2503 ) 2504 2505 # Links which are only present in ICU? 2506 additionalTimeZones = [zone for zone in icuLinks.keys() if not isIANATimeZone(zone)] 2507 if additionalTimeZones: 2508 raise RuntimeError( 2509 "Additional links present in ICU, did you forget " 2510 "to run intl/update-tzdata.sh? %s" % additionalTimeZones 2511 ) 2512 2513 result = chain( 2514 # IANA links which have a different target in ICU. 2515 ( 2516 (zone, target, icuLinks[zone]) 2517 for (zone, target) in ianaLinks.items() 2518 if isICULink(zone) and target != icuLinks[zone] 2519 ), 2520 # IANA links which are zones in ICU. 
2521 ( 2522 (zone, target, zone.name) 2523 for (zone, target) in ianaLinks.items() 2524 if isICUZone(zone) 2525 ), 2526 ) 2527 2528 # Remove unnecessary UTC mappings. 2529 utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"] 2530 result = ( 2531 (zone, target, icuTarget) 2532 for (zone, target, icuTarget) in result 2533 if target not in utcnames or icuTarget not in utcnames 2534 ) 2535 2536 return sorted(result, key=itemgetter(0)) 2537 2538 2539 def readZoneTab(tzdataDir): 2540 zone_country = dict() 2541 2542 zonetab_path = tzdataDir.resolve("zone.tab") 2543 for line in tzdataDir.readlines(zonetab_path): 2544 if line.startswith("#"): 2545 continue 2546 (country, coords, zone, *comments) = line.strip().split("\t") 2547 assert zone not in zone_country 2548 zone_country[zone] = country 2549 2550 return zone_country 2551 2552 2553 # 6.5.1 AvailableNamedTimeZoneIdentifiers ( ) 2554 # 2555 # https://tc39.es/ecma402/#sup-availablenamedtimezoneidentifiers 2556 def availableNamedTimeZoneIdentifiers(tzdataDir, ignoreFactory): 2557 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 2558 2559 with open( 2560 os.path.join(js_src_builtin_intl_dir, "TimeZoneMapping.yaml"), 2561 encoding="utf-8", 2562 ) as f: 2563 time_zone_mapping = yaml.safe_load(f) 2564 2565 zone_country = readZoneTab(tzdataDir) 2566 2567 def country_code_for(name): 2568 if name in zone_country: 2569 return zone_country[name] 2570 return time_zone_mapping[name] 2571 2572 (ianaZones, ianaLinks) = readIANATimeZones(tzdataDir, ignoreFactory) 2573 2574 (backzones, backlinks, packratlinks) = readIANAFiles(tzdataDir, ["backzone"]) 2575 all_backzone_links = {**backlinks, **packratlinks} 2576 2577 # Steps 1-3. (Not applicable) 2578 2579 # Step 4. 2580 zones = set() 2581 links = dict() 2582 2583 # Step 5. (Partial, only zones) 2584 for zone in ianaZones: 2585 # Step 5.a. 2586 primary = zone 2587 2588 # Step 5.b. (Not applicable for zones) 2589 2590 # Step 5.c. 
2591 if primary.name in ["Etc/UTC", "Etc/GMT", "GMT"]: 2592 primary = Zone("UTC", primary.filename) 2593 2594 # Step 5.d. (Not applicable) 2595 2596 # Steps 5.e-f. 2597 if primary == zone: 2598 assert zone not in zones 2599 zones.add(primary) 2600 else: 2601 assert zone not in links 2602 links[zone] = primary.name 2603 2604 # Step 5. (Partial, only links) 2605 for zone, target in ianaLinks.items(): 2606 identifier = zone.name 2607 2608 # Step 5.a. 2609 primary = identifier 2610 2611 # Step 5.b. 2612 if identifier not in zone_country: 2613 # Step 5.b.i. (Not applicable) 2614 2615 # Steps 5.b.ii-iii. 2616 if target.startswith("Etc/"): 2617 primary = target 2618 else: 2619 # Step 5.b.iii.1. 2620 identifier_code_code = country_code_for(identifier) 2621 2622 # Step 5.b.iii.2. 2623 target_code_code = country_code_for(target) 2624 2625 # Steps 5.b.iii.3-4 2626 if identifier_code_code == target_code_code: 2627 primary = target 2628 else: 2629 # Step 5.b.iii.4.a. 2630 country_code_line_count = [ 2631 zone 2632 for (zone, code) in zone_country.items() 2633 if code == identifier_code_code 2634 ] 2635 2636 # Steps 5.b.iii.4.b-c. 2637 if len(country_code_line_count) == 1: 2638 primary = country_code_line_count[0] 2639 else: 2640 assert Zone(identifier) in all_backzone_links 2641 primary = all_backzone_links[Zone(identifier)] 2642 assert identifier_code_code == country_code_for(primary) 2643 2644 # Step 5.c. 2645 if primary in ["Etc/UTC", "Etc/GMT", "GMT"]: 2646 primary = "UTC" 2647 2648 # Step 5.d. (Not applicable) 2649 2650 # Steps 5.e-f. 2651 if primary == identifier: 2652 assert zone not in zones 2653 zones.add(zone) 2654 else: 2655 assert zone not in links 2656 links[zone] = primary 2657 2658 # Ensure all zones and links are valid. 2659 validateTimeZones(zones, links) 2660 2661 # Step 6. 2662 assert Zone("UTC") in zones 2663 2664 # Step 7. 2665 return (zones, links) 2666 2667 2668 generatedFileWarning = "// Generated by make_intl_data.py. DO NOT EDIT." 
tzdataVersionComment = "// tzdata version = {0}"


def processTimeZones(tzdataDir, icuDir, icuTzDir, version, ignoreFactory, out):
    """Read the time zone info and create a new time zone cpp file."""
    print("Processing tzdata mapping...")
    (ianaZones, ianaLinks) = availableNamedTimeZoneIdentifiers(tzdataDir, ignoreFactory)
    (icuZones, icuLinks) = readICUTimeZones(icuDir, icuTzDir, ignoreFactory)
    (legacyZones, legacyLinks) = readICULegacyZones(icuDir)

    if ignoreFactory:
        legacyZones.add(Zone("Factory"))

    # Remove all legacy ICU time zones.
    icuZones = {zone for zone in icuZones if zone not in legacyZones}
    icuLinks = {
        zone: target for (zone, target) in icuLinks.items() if zone not in legacyLinks
    }

    incorrectZones = findIncorrectICUZones(ianaZones, ianaLinks, icuZones, icuLinks)
    if not incorrectZones:
        print("<<< No incorrect ICU time zones found, please update Intl.js! >>>")
        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")

    incorrectLinks = findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks)
    if not incorrectLinks:
        print("<<< No incorrect ICU time zone links found, please update Intl.js! >>>")
        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")

    print("Writing Intl tzdata file...")
    with open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println("")

        println("#ifndef builtin_intl_TimeZoneDataGenerated_h")
        println("#define builtin_intl_TimeZoneDataGenerated_h")
        println("")

        println("namespace js {")
        println("namespace timezone {")
        println("")

        println("// Format:")
        println('// "ZoneName" // ICU-Name [time zone file]')
        println("const char* const ianaZonesTreatedAsLinksByICU[] = {")
        for zone, icuZone in incorrectZones:
            println('    "%s", // %s [%s]' % (zone, icuZone, zone.filename))
        println("};")
        println("")

        println("// Format:")
        println('// "LinkName", "Target" // ICU-Target [time zone file]')
        println("struct LinkAndTarget")
        println("{")
        println("  const char* const link;")
        println("  const char* const target;")
        println("};")
        println("")
        println("const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {")
        for zone, target, icuTarget in incorrectLinks:
            println(
                '    { "%s", "%s" }, // %s [%s]'
                % (zone, target, icuTarget, zone.filename)
            )
        println("};")
        println("")

        println(
            "// Legacy ICU time zones, these are not valid IANA time zone names. We also"
        )
        println("// disallow the old and deprecated System V time zones.")
        println(
            "// https://ssl.icu-project.org/repos/icu/trunk/icu4c/source/tools/tzcode/icuzones"
        )  # NOQA: E501
        println("const char* const legacyICUTimeZones[] = {")
        for zone in chain(sorted(legacyLinks.keys()), sorted(legacyZones)):
            println('    "%s",' % zone)
        println("};")
        println("")

        println("} // namespace timezone")
        println("} // namespace js")
        println("")
        println("#endif /* builtin_intl_TimeZoneDataGenerated_h */")


def generateTzDataTestLinks(tzdataDir, version, ignoreFactory, testDir):
    """Write the timeZone_links.js test verifying link canonicalization."""
    fileName = "timeZone_links.js"

    # Read zone and link infos.
    (_, links) = availableNamedTimeZoneIdentifiers(tzdataDir, ignoreFactory)

    with open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println(
            """
const tzMapper = [
    x => x,
    x => x.toUpperCase(),
    x => x.toLowerCase(),
];
"""
        )

        println("// Link names derived from IANA Time Zone Database.")
        println("const links = {")
        for zone, target in sorted(links.items(), key=itemgetter(0)):
            println('    "%s": "%s",' % (zone, target))
        println("};")

        println(
            """
for (let [linkName, target] of Object.entries(links)) {
    if (target === "Etc/UTC" || target === "Etc/GMT")
        target = "UTC";

    for (let map of tzMapper) {
        let dtf = new Intl.DateTimeFormat(undefined, {timeZone: map(linkName)});
        let resolvedTimeZone = dtf.resolvedOptions().timeZone;
        assertEq(resolvedTimeZone, target, `${linkName} -> ${target}`);
    }
}
"""
        )
        println(
            """
if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )


def generateTzDataTestVersion(tzdataDir, version, testDir):
    """Write the timeZone_version.js test pinning the tzdata version."""
    fileName = "timeZone_version.js"

    with open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println(f"""const tzdata = "{version}";""")

        println(
            """
if (typeof getICUOptions === "undefined") {
    var getICUOptions = SpecialPowers.Cu.getJSTestingFunctions().getICUOptions;
}

var options = getICUOptions();

assertEq(options.tzdata, tzdata);

if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )


def generateTzDataTestCanonicalZones(tzdataDir, version, ignoreFactory, testDir):
    """Write the Intl.supportedValuesOf("timeZone") canonical-zones test."""
    fileName = "supportedValuesOf-timeZones-canonical.js"

    # Read zone and link infos.
    (zones, _) = availableNamedTimeZoneIdentifiers(tzdataDir, ignoreFactory)

    with open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))

        println("const zones = [")
        for zone in sorted(zones):
            println(f'    "{zone}",')
        println("];")

        println(
            """
let supported = Intl.supportedValuesOf("timeZone");

assertEqArray(supported, zones);

if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )


def generateTzDataTestZones(tzdataDir, version, ignoreFactory, testDir):
    """Write the Temporal zones-and-links.js test covering zones and links."""
    fileName = "zones-and-links.js"

    # Read zone and link infos.
    (zones, links) = availableNamedTimeZoneIdentifiers(tzdataDir, ignoreFactory)

    with open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Temporal"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))

        println("const zones = [")
        for zone in sorted(zones):
            println(f'    "{zone}",')
        println("];")

        println("const links = {")
        for link, target in sorted(links.items(), key=itemgetter(0)):
            println(f'    "{link}": "{target}",')
        println("};")

        println(
            """
let epochNanoseconds = [
    new Temporal.PlainDate(1900, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
    new Temporal.PlainDate(1950, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
    new Temporal.PlainDate(1960, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
    new Temporal.PlainDate(1970, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
    new Temporal.PlainDate(1980, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
    new Temporal.PlainDate(1990, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
    new Temporal.PlainDate(2000, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
    new Temporal.PlainDate(2010, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
    new Temporal.PlainDate(2020, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
    new Temporal.PlainDate(2030, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
];

function timeZoneId(zdt) {
    let str = zdt.toString();
    let m = str.match(/(?<=\\[)[\\w\\/_+-]+(?=\\])/);
    assertEq(m !== null, true, str);
    return m[0];
}

for (let zone of zones) {
    let zdt = new Temporal.ZonedDateTime(0n, zone);

    assertEq(zdt.timeZoneId, zone);
    assertEq(timeZoneId(zdt), zone);
}

for (let [link, zone] of Object.entries(links)) {
    assertEq(link === zone, false, `link=${link}, zone=${zone}`);
    assertEq(zones.includes(zone), true, `zone=${zone}`);

    let zdtLink = new Temporal.ZonedDateTime(0n, link);
    let zdtZone = new Temporal.ZonedDateTime(0n, zone);

    assertEq(zdtLink.timeZoneId, link);
    assertEq(timeZoneId(zdtLink), link);

    assertEq(zdtZone.timeZoneId, zone);
    assertEq(timeZoneId(zdtZone), zone);

    assertEq(zdtLink.equals(zdtZone), true, `link=${link}, zone=${zone}`);

    assertEq(
        zdtLink.offsetNanoseconds,
        zdtZone.offsetNanoseconds,
        `link=${link}, zone=${zone}`
    );

    for (let epochNs of epochNanoseconds) {
        assertEq(
            new Temporal.ZonedDateTime(epochNs, link).offsetNanoseconds,
            new Temporal.ZonedDateTime(epochNs, zone).offsetNanoseconds,
            `link=${link}, zone=${zone}, epochNs=${epochNs}`
        );
    }
}

if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )


def generateTzDataTests(tzdataDir, version, ignoreFactory, testDir):
    """Generate all tzdata-derived test files below `testDir`."""
    dtfTestDir = os.path.join(testDir, "DateTimeFormat")
    if not os.path.isdir(dtfTestDir):
        raise RuntimeError("not a directory: %s" % dtfTestDir)

    zdtTestDir = os.path.join(testDir, "../Temporal/ZonedDateTime")
    if not os.path.isdir(zdtTestDir):
        raise RuntimeError("not a directory: %s" % zdtTestDir)

    generateTzDataTestLinks(tzdataDir, version, ignoreFactory, dtfTestDir)
    generateTzDataTestVersion(tzdataDir, version, dtfTestDir)
    generateTzDataTestCanonicalZones(tzdataDir, version, ignoreFactory, testDir)
    generateTzDataTestZones(tzdataDir, version, ignoreFactory, zdtTestDir)


def updateTzdata(topsrcdir, args):
    """Update the time zone cpp file."""

    icuDir = os.path.join(topsrcdir, "intl/icu/source")
    if not os.path.isdir(icuDir):
        raise RuntimeError("not a directory: %s" % icuDir)

    icuTzDir = os.path.join(topsrcdir, "intl/tzdata/source")
    if not os.path.isdir(icuTzDir):
        raise RuntimeError("not a directory: %s" % icuTzDir)

    intlTestDir = os.path.join(topsrcdir, "js/src/tests/non262/Intl")
    if not os.path.isdir(intlTestDir):
        raise RuntimeError("not a directory: %s" % intlTestDir)

    tzDir = args.tz
    if tzDir is not None and not (os.path.isdir(tzDir) or os.path.isfile(tzDir)):
        raise RuntimeError("not a directory or file: %s" % tzDir)
    out = args.out

    # Ignore the placeholder time zone "Factory".
    ignoreFactory = True

    version = icuTzDataVersion(icuTzDir)
    url = (
        "https://www.iana.org/time-zones/repository/releases/tzdata%s.tar.gz" % version
    )

    print("Arguments:")
    print("\ttzdata version: %s" % version)
    print("\ttzdata URL: %s" % url)
    print("\ttzdata directory|file: %s" % tzDir)
    print("\tICU directory: %s" % icuDir)
    print("\tICU timezone directory: %s" % icuTzDir)
    print("\tOutput file: %s" % out)
    print("")

    def updateFrom(f):
        # Accept either a tzdata tarball or an unpacked tzdata directory.
        if os.path.isfile(f) and tarfile.is_tarfile(f):
            with tarfile.open(f, "r:*") as tar:
                processTimeZones(
                    TzDataFile(tar),
                    icuDir,
                    icuTzDir,
                    version,
                    ignoreFactory,
                    out,
                )
                generateTzDataTests(
                    TzDataFile(tar), version, ignoreFactory, intlTestDir
                )
        elif os.path.isdir(f):
            processTimeZones(
                TzDataDir(f),
                icuDir,
                icuTzDir,
                version,
                ignoreFactory,
                out,
            )
            generateTzDataTests(TzDataDir(f), version, ignoreFactory, intlTestDir)
        else:
            raise RuntimeError("unknown format")

    if tzDir is None:
        print("Downloading tzdata file...")
        with closing(urlopen(url)) as tzfile:
            fname = urlsplit(tzfile.geturl()).path.split("/")[-1]
            with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile:
                print("File stored in %s" % tztmpfile.name)
                tztmpfile.write(tzfile.read())
                tztmpfile.flush()
                updateFrom(tztmpfile.name)
    else:
        updateFrom(tzDir)


def readCurrencyFile(tree):
    """Yield (currency, minorUnits, currencyName, countryName) for every
    ISO 4217 entry that uses a non-default (!= 2) number of decimal digits."""
    reCurrency = re.compile(r"^[A-Z]{3}$")
    reIntMinorUnits = re.compile(r"^\d+$")

    for country in tree.iterfind(".//CcyNtry"):
        # Skip entry if no currency information is available.
        currency = country.findtext("Ccy")
        if currency is None:
            continue
        assert reCurrency.match(currency)

        minorUnits = country.findtext("CcyMnrUnts")
        assert minorUnits is not None

        # Skip all entries without minorUnits or which use the default minorUnits.
        if reIntMinorUnits.match(minorUnits) and int(minorUnits) != 2:
            currencyName = country.findtext("CcyNm")
            countryName = country.findtext("CtryNm")
            yield (currency, int(minorUnits), currencyName, countryName)


def writeCurrencyFile(published, currencies, out):
    """Write the currency-digits mapping from `currencies` to the file `out`,
    tagged with the ISO publication date `published`."""
    with open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)
        println(f"// Version: {published}")

        println(
            """
/**
 * Mapping from currency codes to the number of decimal digits used for them.
 * Default is 2 digits.
 *
 * Spec: ISO 4217 Currency and Funds Code List.
 * http://www.currency-iso.org/en/home/tables/table-a1.html
 */"""
        )
        println("var currencyDigits = {")
        for currency, entries in groupby(
            sorted(currencies, key=itemgetter(0)), itemgetter(0)
        ):
            # One comment line per country using this currency, then a single
            # mapping entry.
            for _, minorUnits, currencyName, countryName in entries:
                println(f"    // {currencyName} ({countryName})")
            println(f"    {currency}: {minorUnits},")
        println("};")


def updateCurrency(topsrcdir, args):
    """Update the CurrencyDataGenerated.js file."""
    import xml.etree.ElementTree as ET
    from random import randint

    url = args.url
    out = args.out
    filename = args.file

    print("Arguments:")
    print("\tDownload url: %s" % url)
    print("\tLocal currency file: %s" % filename)
    print("\tOutput file: %s" % out)
    print("")

    def updateFrom(currencyFile):
        print("Processing currency code list file...")
        tree = ET.parse(currencyFile)
        published = tree.getroot().attrib["Pblshd"]
        currencies = readCurrencyFile(tree)

        print("Writing CurrencyData file...")
        writeCurrencyFile(published, currencies, out)

    if filename is not None:
        print("Always make sure you have the newest currency code list file!")
        updateFrom(filename)
    else:
        print("Downloading currency & funds code list...")
        request = UrlRequest(url)
        # Randomized UA; presumably to avoid server-side blocking of
        # scripted downloads — TODO confirm.
        request.add_header(
            "User-agent",
            "Mozilla/5.0 (Mobile; rv:{0}.0) Gecko/{0}.0 Firefox/{0}.0".format(
                randint(1, 999)
            ),
        )
        with closing(urlopen(request)) as currencyFile:
            fname = urlsplit(currencyFile.geturl()).path.split("/")[-1]
            with tempfile.NamedTemporaryFile(suffix=fname) as currencyTmpFile:
                print("File stored in %s" % currencyTmpFile.name)
                currencyTmpFile.write(currencyFile.read())
                currencyTmpFile.flush()
                updateFrom(currencyTmpFile.name)


def writeUnicodeExtensionsMappings(println, mapping, extension):
    """Emit C++ lookup code mapping deprecated BCP 47 `extension` extension
    types to their preferred values, using `mapping` (key -> {type: alias})."""
    println(
        f"""
template <size_t Length>
static inline bool Is{extension}Key(mozilla::Span<const char> key, const char (&str)[Length]) {{
  static_assert(Length == {extension}KeyLength + 1,
                "{extension} extension key is two characters long");
  return memcmp(key.data(), str, Length - 1) == 0;
}}

template <size_t Length>
static inline bool Is{extension}Type(mozilla::Span<const char> type, const char (&str)[Length]) {{
  static_assert(Length > {extension}KeyLength + 1,
                "{extension} extension type contains more than two characters");
  return type.size() == (Length - 1) &&
         memcmp(type.data(), str, Length - 1) == 0;
}}
""".rstrip("\n")
    )

    # Keys with at most this many replacements are handled with chained
    # comparisons; larger tables get the binary-search helpers below.
    linear_search_max_length = 4

    needs_binary_search = any(
        len(replacements.items()) > linear_search_max_length
        for replacements in mapping.values()
    )

    if needs_binary_search:
        println(
            f"""
static int32_t Compare{extension}Type(const char* a, mozilla::Span<const char> b) {{
  MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'),
             "unexpected null-character in string");

  using UnsignedChar = unsigned char;
  for (size_t i = 0; i < b.size(); i++) {{
    // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
    // we've reached the end of |a|, the below if-statement will always be true.
    // That ensures we don't read past the end of |a|.
    if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{
      return r;
    }}
  }}

  // Return zero if both strings are equal or a positive number if |b| is a
  // prefix of |a|.
  return int32_t(UnsignedChar(a[b.size()]));
}}

template <size_t Length>
static inline const char* Search{extension}Replacement(
    const char* (&types)[Length], const char* (&aliases)[Length],
    mozilla::Span<const char> type) {{

  auto p = std::lower_bound(std::begin(types), std::end(types), type,
                            [](const auto& a, const auto& b) {{
                              return Compare{extension}Type(a, b) < 0;
                            }});
  if (p != std::end(types) && Compare{extension}Type(*p, type) == 0) {{
    return aliases[std::distance(std::begin(types), p)];
  }}
  return nullptr;
}}
""".rstrip("\n")
        )

    println(
        f"""
/**
 * Mapping from deprecated BCP 47 {extension} extension types to their preferred
 * values.
 *
 * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
 * Spec: https://www.unicode.org/reports/tr35/#t_Extension
 */
const char* mozilla::intl::Locale::Replace{extension}ExtensionType(
    mozilla::Span<const char> key, mozilla::Span<const char> type) {{
  MOZ_ASSERT(key.size() == {extension}KeyLength);
  MOZ_ASSERT(IsCanonicallyCased{extension}Key(key));

  MOZ_ASSERT(type.size() > {extension}KeyLength);
  MOZ_ASSERT(IsCanonicallyCased{extension}Type(type));
"""
    )

    def to_hash_key(replacements):
        return str(sorted(replacements.items()))

    def write_array(subtags, name, length):
        # Fit as many fixed-width entries per line as an 80-column limit allows.
        max_entries = (80 - len("    ")) // (length + len('"", '))

        println(f"    static const char* {name}[{len(subtags)}] = {{")

        for entries in grouper(subtags, max_entries):
            entries = (
                f'"{tag}"'.center(length + 2) for tag in entries if tag is not None
            )
            println("        {},".format(", ".join(entries)))

        println("    };")

    # Merge duplicate keys.
3252 key_aliases = {} 3253 for key, replacements in sorted(mapping.items(), key=itemgetter(0)): 3254 hash_key = to_hash_key(replacements) 3255 if hash_key not in key_aliases: 3256 key_aliases[hash_key] = [] 3257 else: 3258 key_aliases[hash_key].append(key) 3259 3260 first_key = True 3261 for key, replacements in sorted(mapping.items(), key=itemgetter(0)): 3262 hash_key = to_hash_key(replacements) 3263 if key in key_aliases[hash_key]: 3264 continue 3265 3266 cond = (f'Is{extension}Key(key, "{k}")' for k in [key] + key_aliases[hash_key]) 3267 3268 if_kind = "if" if first_key else "else if" 3269 cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond) 3270 println( 3271 f""" 3272 {if_kind} ({cond}) {{""".strip("\n") 3273 ) 3274 first_key = False 3275 3276 replacements = sorted(replacements.items(), key=itemgetter(0)) 3277 3278 if len(replacements) > linear_search_max_length: 3279 types = [t for (t, _) in replacements] 3280 preferred = [r for (_, r) in replacements] 3281 max_len = max(len(k) for k in types + preferred) 3282 3283 write_array(types, "types", max_len) 3284 write_array(preferred, "aliases", max_len) 3285 println( 3286 f""" 3287 return Search{extension}Replacement(types, aliases, type); 3288 """.strip("\n") 3289 ) 3290 else: 3291 for type, replacement in replacements: 3292 println( 3293 f""" 3294 if (Is{extension}Type(type, "{type}")) {{ 3295 return "{replacement}"; 3296 }}""".strip("\n") 3297 ) 3298 3299 println( 3300 """ 3301 }""".lstrip("\n") 3302 ) 3303 3304 println( 3305 """ 3306 return nullptr; 3307 } 3308 """.strip("\n") 3309 ) 3310 3311 3312 def readICUUnitResourceFile(filepath): 3313 """Return a set of unit descriptor pairs where the first entry denotes the unit type and the 3314 second entry the unit name. 
def readICUUnitResourceFile(filepath):
    """Return a set of unit descriptor pairs where the first entry denotes the unit type and the
    second entry the unit name.

    Example:

        root{
            units{
                compound{
                }
                coordinate{
                }
                length{
                    meter{
                    }
                }
            }
            unitsNarrow:alias{"/LOCALE/unitsShort"}
            unitsShort{
                duration{
                    day{
                    }
                    day-person:alias{"/LOCALE/unitsShort/duration/day"}
                }
                length{
                    meter{
                    }
                }
            }
        }

    Returns {("length", "meter"), ("duration", "day"), ("duration", "day-person")}
    """

    # Matches the opening line of a resource table, e.g. `length{` or
    # `day-person:alias{`.
    start_table_re = re.compile(r"^([\w\-%:\"]+)\{$")
    end_table_re = re.compile(r"^\}$")
    # Matches a leaf entry, e.g. `dnam{"meter"}`.
    table_entry_re = re.compile(r"^([\w\-%:\"]+)\{\"(.*?)\"\}$")

    # The current resource table.
    table = {}

    # List of parent tables when parsing.
    parents = []

    # Track multi-line comments state.
    in_multiline_comment = False

    # NOTE(review): |flines| is a helper defined earlier in this file; it
    # presumably yields the file's lines decoded with the given encoding.
    for line in flines(filepath, "utf-8-sig"):
        # Remove leading and trailing whitespace.
        line = line.strip()

        # Skip over comments.
        if in_multiline_comment:
            if line.endswith("*/"):
                in_multiline_comment = False
            continue

        if line.startswith("//"):
            continue

        if line.startswith("/*"):
            in_multiline_comment = True
            continue

        # Try to match the start of a table, e.g. `length{` or `meter{`.
        match = start_table_re.match(line)
        if match:
            # Descend into a new nested table; the parent is restored when the
            # matching closing brace is seen.
            parents.append(table)
            table_name = match.group(1)
            new_table = {}
            table[table_name] = new_table
            table = new_table
            continue

        # Try to match the end of a table.
        match = end_table_re.match(line)
        if match:
            table = parents.pop()
            continue

        # Try to match a table entry, e.g. `dnam{"meter"}`.
        match = table_entry_re.match(line)
        if match:
            entry_key = match.group(1)
            entry_value = match.group(2)
            table[entry_key] = entry_value
            continue

        raise Exception(f"unexpected line: '{line}' in {filepath}")

    assert len(parents) == 0, "Not all tables closed"
    assert len(table) == 1, "More than one root table"

    # Remove the top-level language identifier table.
    (_, unit_table) = table.popitem()

    # Add all units for the three display formats "units", "unitsNarrow", and "unitsShort".
    # But exclude the pseudo-units "compound" and "coordinate".  Alias entries
    # such as `day-person:alias` have their ":alias" suffix (6 chars) stripped.
    return {
        (unit_type, unit_name if not unit_name.endswith(":alias") else unit_name[:-6])
        for unit_display in ("units", "unitsNarrow", "unitsShort")
        if unit_display in unit_table
        for (unit_type, unit_names) in unit_table[unit_display].items()
        if unit_type not in {"compound", "coordinate"}
        for unit_name in unit_names.keys()
    }
3423 """ 3424 3425 def find_match(unit): 3426 unit_match = [ 3427 (unit_type, unit_name) 3428 for (unit_type, unit_name) in all_units 3429 if unit_name == unit 3430 ] 3431 if unit_match: 3432 assert len(unit_match) == 1 3433 return unit_match[0] 3434 return None 3435 3436 def compound_unit_identifiers(): 3437 for numerator in sanctioned_units: 3438 for denominator in sanctioned_units: 3439 yield f"{numerator}-per-{denominator}" 3440 3441 supported_simple_units = {find_match(unit) for unit in sanctioned_units} 3442 assert None not in supported_simple_units 3443 3444 supported_compound_units = { 3445 unit_match 3446 for unit_match in (find_match(unit) for unit in compound_unit_identifiers()) 3447 if unit_match 3448 } 3449 3450 return supported_simple_units | supported_compound_units 3451 3452 3453 def readICUDataFilterForUnits(data_filter_file): 3454 with open(data_filter_file, encoding="utf-8") as f: 3455 data_filter = json.load(f) 3456 3457 # Find the rule set for the "unit_tree". 3458 unit_tree_rules = [ 3459 entry["rules"] 3460 for entry in data_filter["resourceFilters"] 3461 if entry["categories"] == ["unit_tree"] 3462 ] 3463 assert len(unit_tree_rules) == 1 3464 3465 # Compute the list of included units from that rule set. The regular expression must match 3466 # "+/*/length/meter" and mustn't match either "-/*" or "+/*/compound". 
3467 included_unit_re = re.compile(r"^\+/\*/(.+?)/(.+)$") 3468 filtered_units = (included_unit_re.match(unit) for unit in unit_tree_rules[0]) 3469 3470 return {(unit.group(1), unit.group(2)) for unit in filtered_units if unit} 3471 3472 3473 def writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units): 3474 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3475 intl_components_src_dir = os.path.join( 3476 js_src_builtin_intl_dir, "../../../../intl/components/src" 3477 ) 3478 3479 def find_unit_type(unit): 3480 result = [ 3481 unit_type for (unit_type, unit_name) in all_units if unit_name == unit 3482 ] 3483 assert result and len(result) == 1 3484 return result[0] 3485 3486 sanctioned_js_file = os.path.join( 3487 js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiersGenerated.js" 3488 ) 3489 with open(sanctioned_js_file, mode="w", encoding="utf-8", newline="") as f: 3490 println = partial(print, file=f) 3491 3492 sanctioned_units_object = json.dumps( 3493 {unit: True for unit in sorted(sanctioned_units)}, 3494 sort_keys=True, 3495 indent=2, 3496 separators=(",", ": "), 3497 ) 3498 3499 println(generatedFileWarning) 3500 3501 println( 3502 """ 3503 /** 3504 * The list of currently supported simple unit identifiers. 
def writeUnitTestFiles(all_units, sanctioned_units):
    """Generate test files for unit number formatters.

    Writes three jstests files into ../../tests/non262/Intl/NumberFormat,
    each prefixed with the reftest directive, the generated-file warning, and
    the sorted list of sanctioned simple unit identifiers.
    """

    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
    test_dir = os.path.join(
        js_src_builtin_intl_dir, "../../tests/non262/Intl/NumberFormat"
    )

    def write_test(file_name, test_content, indent=4):
        # Write a single test file: shared header + sanctioned-units array +
        # |test_content| + the reportCompare trailer (indented by |indent|).
        file_path = os.path.join(test_dir, file_name)
        with open(file_path, mode="w", encoding="utf-8", newline="") as f:
            println = partial(print, file=f)

            println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
            println("")
            # NOTE: |generatedFileWarning| is a constant defined earlier in
            # this file.
            println(generatedFileWarning)
            println("")

            sanctioned_units_array = json.dumps(
                [unit for unit in sorted(sanctioned_units)],
                indent=indent,
                separators=(",", ": "),
            )

            println(
                f"const sanctionedSimpleUnitIdentifiers = {sanctioned_units_array};"
            )

            println(test_content)

            println(
                """
if (typeof reportCompare === "function")
{}reportCompare(true, true);""".format(" " * indent)
            )

    write_test(
        "unit-compound-combinations.js",
        """
// Test all simple unit identifier combinations are allowed.

for (const numerator of sanctionedSimpleUnitIdentifiers) {
  for (const denominator of sanctionedSimpleUnitIdentifiers) {
    const unit = `${numerator}-per-${denominator}`;
    const nf = new Intl.NumberFormat("en", {style: "unit", unit});

    assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join(""));
  }
}""",
    )

    all_units_array = json.dumps(
        ["-".join(unit) for unit in sorted(all_units)], indent=4, separators=(",", ": ")
    )

    write_test(
        "unit-well-formed.js",
        f"""
const allUnits = {all_units_array};
"""
        + r"""
// Test only sanctioned unit identifiers are allowed.

for (const typeAndUnit of allUnits) {
  const [_, type, unit] = typeAndUnit.match(/(\w+)-(.+)/);

  let allowed;
  if (unit.includes("-per-")) {
    const [numerator, denominator] = unit.split("-per-");
    allowed = sanctionedSimpleUnitIdentifiers.includes(numerator) &&
              sanctionedSimpleUnitIdentifiers.includes(denominator);
  } else {
    allowed = sanctionedSimpleUnitIdentifiers.includes(unit);
  }

  if (allowed) {
    const nf = new Intl.NumberFormat("en", {style: "unit", unit});
    assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join(""));
  } else {
    assertThrowsInstanceOf(() => new Intl.NumberFormat("en", {style: "unit", unit}),
                           RangeError, `Missing error for "${typeAndUnit}"`);
  }
}""",
    )

    write_test(
        "unit-formatToParts-has-unit-field.js",
        """
// Test only English and Chinese to keep the overall runtime reasonable.
//
// Chinese is included because it contains more than one "unit" element for
// certain unit combinations.
const locales = ["en", "zh"];

// Plural rules for English only differentiate between "one" and "other". Plural
// rules for Chinese only use "other". That means we only need to test two values
// per unit.
const values = [0, 1];

// Ensure unit formatters contain at least one "unit" element.

for (const locale of locales) {
  for (const unit of sanctionedSimpleUnitIdentifiers) {
    const nf = new Intl.NumberFormat(locale, {style: "unit", unit});

    for (const value of values) {
      assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true,
               `locale=${locale}, unit=${unit}`);
    }
  }

  for (const numerator of sanctionedSimpleUnitIdentifiers) {
    for (const denominator of sanctionedSimpleUnitIdentifiers) {
      const unit = `${numerator}-per-${denominator}`;
      const nf = new Intl.NumberFormat(locale, {style: "unit", unit});

      for (const value of values) {
        assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true,
                 `locale=${locale}, unit=${unit}`);
      }
    }
  }
}""",
        indent=2,
    )
3657 3658 for (const locale of locales) { 3659 for (const unit of sanctionedSimpleUnitIdentifiers) { 3660 const nf = new Intl.NumberFormat(locale, {style: "unit", unit}); 3661 3662 for (const value of values) { 3663 assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true, 3664 `locale=${locale}, unit=${unit}`); 3665 } 3666 } 3667 3668 for (const numerator of sanctionedSimpleUnitIdentifiers) { 3669 for (const denominator of sanctionedSimpleUnitIdentifiers) { 3670 const unit = `${numerator}-per-${denominator}`; 3671 const nf = new Intl.NumberFormat(locale, {style: "unit", unit}); 3672 3673 for (const value of values) { 3674 assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true, 3675 `locale=${locale}, unit=${unit}`); 3676 } 3677 } 3678 } 3679 }""", 3680 indent=2, 3681 ) 3682 3683 3684 def updateUnits(topsrcdir, args): 3685 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3686 icu_path = os.path.join(topsrcdir, "intl", "icu") 3687 icu_unit_path = os.path.join(icu_path, "source", "data", "unit") 3688 3689 with open( 3690 os.path.join(js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiers.yaml"), 3691 encoding="utf-8", 3692 ) as f: 3693 sanctioned_units = yaml.safe_load(f) 3694 3695 # Read all possible ICU unit identifiers from the "unit/root.txt" resource. 3696 unit_root_file = os.path.join(icu_unit_path, "root.txt") 3697 all_units = readICUUnitResourceFile(unit_root_file) 3698 3699 # Compute the set of effectively supported ICU unit identifiers. 3700 supported_units = computeSupportedUnits(all_units, sanctioned_units) 3701 3702 # Read the list of units we're including into the ICU data file. 3703 data_filter_file = os.path.join(icu_path, "data_filter.json") 3704 filtered_units = readICUDataFilterForUnits(data_filter_file) 3705 3706 # Both sets must match to avoid resource loading errors at runtime. 
def readICUNumberingSystemsResourceFile(filepath):
    """Returns a dictionary of numbering systems where the key denotes the numbering system name
    and the value a dictionary with additional numbering system data.

    Example:

        numberingSystems:table(nofallback){
            numberingSystems{
                latn{
                    algorithmic:int{0}
                    desc{"0123456789"}
                    radix:int{10}
                }
                roman{
                    algorithmic:int{1}
                    desc{"%roman-upper"}
                    radix:int{10}
                }
            }
        }

    Returns {"latn": {"digits": "0123456789", "algorithmic": False},
             "roman": {"algorithmic": True}}
    """

    # Matches the opening line of a table, with an optional type annotation,
    # e.g. `latn{` or `numberingSystems:table(nofallback){`.
    start_table_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{$")
    end_table_re = re.compile(r"^\}$")
    # Matches a leaf entry holding either a string (group 2) or an integer
    # (group 3), e.g. `desc{"0123456789"}` or `radix:int{10}`.
    table_entry_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{(?:(?:\"(.*?)\")|(\d+))\}$")

    # The current resource table.
    table = {}

    # List of parent tables when parsing.
    parents = []

    # Track multi-line comments state.
    in_multiline_comment = False

    # NOTE(review): |flines| is a helper defined earlier in this file; it
    # presumably yields the file's lines decoded with the given encoding.
    for line in flines(filepath, "utf-8-sig"):
        # Remove leading and trailing whitespace.
        line = line.strip()

        # Skip over comments.
        if in_multiline_comment:
            if line.endswith("*/"):
                in_multiline_comment = False
            continue

        if line.startswith("//"):
            continue

        if line.startswith("/*"):
            in_multiline_comment = True
            continue

        # Try to match the start of a table, e.g. `latn{`.
        match = start_table_re.match(line)
        if match:
            # Descend into a new nested table; the parent is restored when the
            # matching closing brace is seen.
            parents.append(table)
            table_name = match.group(1)
            new_table = {}
            table[table_name] = new_table
            table = new_table
            continue

        # Try to match the end of a table.
        match = end_table_re.match(line)
        if match:
            table = parents.pop()
            continue

        # Try to match a table entry, e.g. `desc{"0123456789"}`.
        match = table_entry_re.match(line)
        if match:
            entry_key = match.group(1)
            # Group 2 carries the string form, group 3 the integer form.
            entry_value = (
                match.group(2) if match.group(2) is not None else int(match.group(3))
            )
            table[entry_key] = entry_value
            continue

        raise Exception(f"unexpected line: '{line}' in {filepath}")

    assert len(parents) == 0, "Not all tables closed"
    assert len(table) == 1, "More than one root table"

    # Remove the two top-level "numberingSystems" tables.
    (_, numbering_systems) = table.popitem()
    (_, numbering_systems) = numbering_systems.popitem()

    # Assert all numbering systems use base 10.
    assert all(ns["radix"] == 10 for ns in numbering_systems.values())

    # Return the numbering systems.  Only non-algorithmic systems carry their
    # digits string; algorithmic systems are just flagged as such.
    return {
        key: (
            {"digits": value["desc"], "algorithmic": False}
            if not bool(value["algorithmic"])
            else {"algorithmic": True}
        )
        for (key, value) in numbering_systems.items()
    }
3819 return { 3820 key: ( 3821 {"digits": value["desc"], "algorithmic": False} 3822 if not bool(value["algorithmic"]) 3823 else {"algorithmic": True} 3824 ) 3825 for (key, value) in numbering_systems.items() 3826 } 3827 3828 3829 def writeNumberingSystemFiles(numbering_systems): 3830 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3831 3832 numbering_systems_js_file = os.path.join( 3833 js_src_builtin_intl_dir, "NumberingSystemsGenerated.h" 3834 ) 3835 with open(numbering_systems_js_file, mode="w", encoding="utf-8", newline="") as f: 3836 println = partial(print, file=f) 3837 3838 println(generatedFileWarning) 3839 3840 println( 3841 """ 3842 /** 3843 * The list of numbering systems with simple digit mappings. 3844 */ 3845 3846 #ifndef builtin_intl_NumberingSystemsGenerated_h 3847 #define builtin_intl_NumberingSystemsGenerated_h 3848 """ 3849 ) 3850 3851 simple_numbering_systems = sorted( 3852 name 3853 for (name, value) in numbering_systems.items() 3854 if not value["algorithmic"] 3855 ) 3856 3857 println("// clang-format off") 3858 println("#define NUMBERING_SYSTEMS_WITH_SIMPLE_DIGIT_MAPPINGS \\") 3859 println( 3860 "{}".format( 3861 ", \\\n".join(f' "{name}"' for name in simple_numbering_systems) 3862 ) 3863 ) 3864 println("// clang-format on") 3865 println("") 3866 3867 println("#endif // builtin_intl_NumberingSystemsGenerated_h") 3868 3869 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3870 test_dir = os.path.join(js_src_builtin_intl_dir, "../../tests/non262/Intl") 3871 3872 intl_shell_js_file = os.path.join(test_dir, "shell.js") 3873 3874 with open(intl_shell_js_file, mode="w", encoding="utf-8", newline="") as f: 3875 println = partial(print, file=f) 3876 3877 println(generatedFileWarning) 3878 3879 println( 3880 f""" 3881 // source: CLDR file common/bcp47/number.xml; version CLDR {readCLDRVersionFromICU()}. 
3882 // https://github.com/unicode-org/cldr/blob/master/common/bcp47/number.xml 3883 // https://github.com/unicode-org/cldr/blob/master/common/supplemental/numberingSystems.xml 3884 """.rstrip() 3885 ) 3886 3887 numbering_systems_object = json.dumps( 3888 numbering_systems, 3889 indent=2, 3890 separators=(",", ": "), 3891 sort_keys=True, 3892 ensure_ascii=False, 3893 ) 3894 println(f"const numberingSystems = {numbering_systems_object};") 3895 3896 3897 def updateNumberingSystems(topsrcdir, args): 3898 js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__)) 3899 icu_path = os.path.join(topsrcdir, "intl", "icu") 3900 icu_misc_path = os.path.join(icu_path, "source", "data", "misc") 3901 3902 with open( 3903 os.path.join(js_src_builtin_intl_dir, "NumberingSystems.yaml"), 3904 encoding="utf-8", 3905 ) as f: 3906 numbering_systems = yaml.safe_load(f) 3907 3908 # Read all possible ICU unit identifiers from the "misc/numberingSystems.txt" resource. 3909 misc_ns_file = os.path.join(icu_misc_path, "numberingSystems.txt") 3910 all_numbering_systems = readICUNumberingSystemsResourceFile(misc_ns_file) 3911 3912 all_numbering_systems_simple_digits = { 3913 name 3914 for (name, value) in all_numbering_systems.items() 3915 if not value["algorithmic"] 3916 } 3917 3918 # Assert ICU includes support for all required numbering systems. If this assertion fails, 3919 # something is broken in ICU. 3920 assert all_numbering_systems_simple_digits.issuperset(numbering_systems), ( 3921 f"{numbering_systems.difference(all_numbering_systems_simple_digits)}" 3922 ) 3923 3924 # Assert the spec requires support for all numbering systems with simple digit mappings. If 3925 # this assertion fails, file a PR at <https://github.com/tc39/ecma402> to include any new 3926 # numbering systems. 
if __name__ == "__main__":
    import argparse

    # This script must reside in js/src/builtin/intl to work correctly.
    (thisDir, thisFile) = os.path.split(os.path.abspath(__file__))
    dirPaths = os.path.normpath(thisDir).split(os.sep)
    if "/".join(dirPaths[-4:]) != "js/src/builtin/intl":
        raise RuntimeError("%s must reside in js/src/builtin/intl" % __file__)
    topsrcdir = "/".join(dirPaths[:-4])

    def EnsureHttps(v):
        # argparse type-validator: only https URLs are accepted.
        if not v.startswith("https:"):
            # Bug fix: the message used '"... https: " % v' with no conversion
            # specifier, which raised a TypeError instead of the intended
            # argparse error; '%s' makes the offending URL part of the message.
            raise argparse.ArgumentTypeError("URL protocol must be https: %s" % v)
        return v

    parser = argparse.ArgumentParser(description="Update intl data.")
    subparsers = parser.add_subparsers(help="Select update mode")

    # "langtags": regenerate the CLDR language tag mappings.
    parser_cldr_tags = subparsers.add_parser(
        "langtags", help="Update CLDR language tags data"
    )
    parser_cldr_tags.add_argument(
        "--version", metavar="VERSION", help="CLDR version number"
    )
    parser_cldr_tags.add_argument(
        "--url",
        metavar="URL",
        default="https://unicode.org/Public/cldr/<VERSION>/cldr-common-<VERSION>.zip",
        type=EnsureHttps,
        help="Download url CLDR data (default: %(default)s)",
    )
    parser_cldr_tags.add_argument(
        "--out",
        default=os.path.join(
            topsrcdir, "intl", "components", "src", "LocaleGenerated.cpp"
        ),
        help="Output file (default: %(default)s)",
    )
    parser_cldr_tags.add_argument(
        "file", nargs="?", help="Local cldr-common.zip file, if omitted uses <URL>"
    )
    parser_cldr_tags.set_defaults(func=updateCLDRLangTags)

    # "tzdata": regenerate the time zone data mappings.
    parser_tz = subparsers.add_parser("tzdata", help="Update tzdata")
    parser_tz.add_argument(
        "--tz",
        help="Local tzdata directory or file, if omitted downloads tzdata "
        "distribution from https://www.iana.org/time-zones/",
    )
    parser_tz.add_argument(
        "--out",
        default=os.path.join(thisDir, "TimeZoneDataGenerated.h"),
        help="Output file (default: %(default)s)",
    )
    parser_tz.set_defaults(func=partial(updateTzdata, topsrcdir))

    # "currency": regenerate the currency-to-decimal-digits mapping.
    parser_currency = subparsers.add_parser(
        "currency", help="Update currency digits mapping"
    )
    parser_currency.add_argument(
        "--url",
        metavar="URL",
        default="https://www.six-group.com/dam/download/financial-information/data-center/iso-currrency/lists/list-one.xml",  # NOQA: E501
        type=EnsureHttps,
        help="Download url for the currency & funds code list (default: %(default)s)",
    )
    parser_currency.add_argument(
        "--out",
        default=os.path.join(thisDir, "CurrencyDataGenerated.js"),
        help="Output file (default: %(default)s)",
    )
    parser_currency.add_argument(
        "file", nargs="?", help="Local currency code list file, if omitted uses <URL>"
    )
    parser_currency.set_defaults(func=partial(updateCurrency, topsrcdir))

    # "units": regenerate the sanctioned unit identifier files.
    parser_units = subparsers.add_parser(
        "units", help="Update sanctioned unit identifiers mapping"
    )
    parser_units.set_defaults(func=partial(updateUnits, topsrcdir))

    # "numbering": regenerate the numbering system files.
    parser_numbering_systems = subparsers.add_parser(
        "numbering", help="Update numbering systems with simple digit mappings"
    )
    parser_numbering_systems.set_defaults(
        func=partial(updateNumberingSystems, topsrcdir)
    )

    args = parser.parse_args()
    args.func(args)