make_unicode.py (50312B)
1 #!/usr/bin/env python 2 # Based upon makeunicodedata.py 3 # (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py) 4 # written by Fredrik Lundh (fredrik@pythonware.com) 5 # 6 # Copyright (C) 2011 Tom Schuster <evilpies@gmail.com> 7 # 8 # This program is free software: you can redistribute it and/or modify 9 # it under the terms of the GNU General Public License as published by 10 # the Free Software Foundation, either version 3 of the License, or 11 # (at your option) any later version. 12 # 13 # This program is distributed in the hope that it will be useful, 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 # GNU General Public License for more details. 17 # 18 # You should have received a copy of the GNU General Public License 19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 20 21 import csv 22 import io 23 import os 24 import re 25 import sys 26 from contextlib import closing 27 from functools import partial 28 from itertools import chain, tee, zip_longest 29 from operator import is_not, itemgetter 30 from urllib.request import urlopen 31 from zipfile import ZipFile 32 33 34 class codepoint_dict(dict): 35 def name(self, code_point): 36 (_, _, name, alias) = self[code_point] 37 return "{}{}".format(name, (" (" + alias + ")" if alias else "")) 38 39 def full_name(self, code_point): 40 (_, _, name, alias) = self[code_point] 41 return "U+{:04X} {}{}".format( 42 code_point, name, (" (" + alias + ")" if alias else "") 43 ) 44 45 46 # ECMAScript 2016 47 # §11.2 White Space 48 whitespace = [ 49 # python doesn't support using control character names :( 50 0x9, # CHARACTER TABULATION 51 0xB, # LINE TABULATION 52 0xC, # FORM FEED 53 ord("\N{SPACE}"), 54 ord("\N{NO-BREAK SPACE}"), 55 ord("\N{ZERO WIDTH NO-BREAK SPACE}"), # also BOM 56 ] 57 58 # §11.3 Line Terminators 59 line_terminator = [ 60 0xA, # LINE FEED 61 0xD, # CARRIAGE RETURN 62 ord("\N{LINE SEPARATOR}"), 63 ord("\N{PARAGRAPH SEPARATOR}"), 64 ] 65 66 # These are also part of IdentifierPart §11.6 Names and Keywords 67 compatibility_identifier_part = [ 68 ord("\N{ZERO WIDTH NON-JOINER}"), 69 ord("\N{ZERO WIDTH JOINER}"), 70 ] 71 72 FLAG_SPACE = 1 << 0 73 FLAG_UNICODE_ID_START = 1 << 1 74 FLAG_UNICODE_ID_CONTINUE_ONLY = 1 << 2 75 76 MAX_BMP = 0xFFFF 77 78 public_domain = """ 79 /* 80 * Any copyright is dedicated to the Public Domain. 81 * http://creativecommons.org/licenses/publicdomain/ 82 */ 83 """ 84 85 mpl_license = """\ 86 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 87 * vim: set ts=8 sts=4 et sw=4 tw=99: 88 * This Source Code Form is subject to the terms of the Mozilla Public 89 * License, v. 2.0. If a copy of the MPL was not distributed with this 90 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 91 """ 92 93 warning_message = """\ 94 /* Generated by make_unicode.py DO NOT MODIFY */ 95 """ 96 97 unicode_version_message = """\ 98 /* Unicode version: {0} */ 99 """ 100 101 102 def read_unicode_data(unicode_data): 103 """ 104 If you want to understand how this wonderful file format works checkout 105 Unicode Standard Annex #44 - Unicode Character Database 106 http://www.unicode.org/reports/tr44/ 107 """ 108 109 reader = csv.reader(unicode_data, delimiter=";") 110 111 while True: 112 row = next(reader, None) 113 if row is None: 114 return 115 name = row[1] 116 117 # We need to expand the UAX #44 4.2.3 Code Point Range 118 if name.startswith("<") and name.endswith("First>"): 119 next_row = next(reader) 120 121 for i in range(int(row[0], 16), int(next_row[0], 16) + 1): 122 row[0] = i 123 row[1] = name[1:-8] 124 125 yield row 126 else: 127 row[0] = int(row[0], 16) 128 yield row 129 130 131 def read_case_folding(case_folding): 132 """ 133 File format is: 134 <code>; <status>; <mapping>; # <name> 135 """ 136 137 for line in case_folding: 138 if line == "\n" or line.startswith("#"): 139 continue 140 row = line.split("; ") 141 if row[1] in ["F", "T"]: 142 continue 143 assert row[1] in ["C", "S"], "expect either (C)ommon or (S)imple case foldings" 144 code = int(row[0], 16) 145 mapping = int(row[2], 16) 146 yield (code, mapping) 147 148 149 def read_derived_core_properties(derived_core_properties): 150 for line in derived_core_properties: 151 if line == "\n" or line.startswith("#"): 152 continue 153 row = line.split("#")[0].split(";") 154 char_range = row[0].strip() 155 char_property = row[1].strip() 156 if ".." not in char_range: 157 yield (int(char_range, 16), char_property) 158 else: 159 [start, end] = char_range.split("..") 160 for char in range(int(start, 16), int(end, 16) + 1): 161 yield (char, char_property) 162 163 164 def read_special_casing(special_casing): 165 # Format: 166 # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment> 167 for line in special_casing: 168 if line == "\n" or line.startswith("#"): 169 continue 170 row = line.split("#")[0].split(";") 171 code = int(row[0].strip(), 16) 172 lower = row[1].strip() 173 lower = [int(c, 16) for c in lower.split(" ")] if lower else [] 174 upper = row[3].strip() 175 upper = [int(c, 16) for c in upper.split(" ")] if upper else [] 176 languages = [] 177 contexts = [] 178 condition = row[4].strip() 179 if condition: 180 for cond in condition.split(" "): 181 if cond[0].islower(): 182 languages.append(cond) 183 else: 184 contexts.append(cond) 185 pass 186 yield (code, lower, upper, languages, contexts) 187 188 189 def int_ranges(ints): 190 """Yields consecutive ranges (inclusive) from integer values.""" 191 (a, b) = tee(sorted(ints)) 192 start = next(b) 193 for curr, succ in zip_longest(a, b): 194 if curr + 1 != succ: 195 yield (start, curr) 196 start = succ 197 198 199 def utf16_encode(code): 200 NonBMPMin = 0x10000 201 LeadSurrogateMin = 0xD800 202 TrailSurrogateMin = 0xDC00 203 204 lead = (code - NonBMPMin) // 1024 + LeadSurrogateMin 205 trail = ((code - NonBMPMin) % 1024) + TrailSurrogateMin 206 207 return lead, trail 208 209 210 def make_non_bmp_convert_macro(out_file, name, convert_map, codepoint_table): 211 # Find continuous range in convert_map. 212 convert_list = [] 213 entry = None 214 for code in sorted(convert_map.keys()): 215 lead, trail = utf16_encode(code) 216 converted = convert_map[code] 217 diff = converted - code 218 219 if ( 220 entry 221 and code == entry["code"] + entry["length"] 222 and diff == entry["diff"] 223 and lead == entry["lead"] 224 ): 225 entry["length"] += 1 226 continue 227 228 entry = { 229 "code": code, 230 "diff": diff, 231 "length": 1, 232 "lead": lead, 233 "trail": trail, 234 } 235 convert_list.append(entry) 236 237 # Generate macro call for each range. 238 lines = [] 239 comment = [] 240 for entry in convert_list: 241 from_code = entry["code"] 242 to_code = entry["code"] + entry["length"] - 1 243 diff = entry["diff"] 244 245 lead = entry["lead"] 246 from_trail = entry["trail"] 247 to_trail = entry["trail"] + entry["length"] - 1 248 249 lines.append( 250 f" MACRO(0x{from_code:x}, 0x{to_code:x}, 0x{lead:x}, 0x{from_trail:x}, 0x{to_trail:x}, {diff:d})" 251 ) 252 comment.append( 253 f"// {codepoint_table.full_name(from_code)} .. {codepoint_table.full_name(to_code)}" 254 ) 255 256 out_file.write("\n".join(comment)) 257 out_file.write("\n") 258 out_file.write(f"#define FOR_EACH_NON_BMP_{name}(MACRO) \\\n") 259 out_file.write(" \\\n".join(lines)) 260 out_file.write("\n") 261 262 263 def process_derived_core_properties(derived_core_properties): 264 id_start = set() 265 id_continue = set() 266 267 for char, prop in read_derived_core_properties(derived_core_properties): 268 if prop == "ID_Start": 269 id_start.add(char) 270 if prop == "ID_Continue": 271 id_continue.add(char) 272 273 return (id_start, id_continue) 274 275 276 def process_unicode_data(unicode_data, derived_core_properties): 277 dummy = (0, 0, 0) 278 table = [dummy] 279 cache = {dummy: 0} 280 index = [0] * (MAX_BMP + 1) 281 282 codepoint_table = codepoint_dict() 283 test_space_table = [] 284 285 non_bmp_lower_map = {} 286 non_bmp_upper_map = {} 287 non_bmp_id_start_set = {} 288 non_bmp_id_cont_set = {} 289 non_bmp_space_set = {} 290 291 (id_start, id_continue) = process_derived_core_properties(derived_core_properties) 292 293 for row in read_unicode_data(unicode_data): 294 code = row[0] 295 name = row[1] 296 category = row[2] 297 alias = row[-5] 298 uppercase = row[-3] 299 lowercase = row[-2] 300 301 if uppercase: 302 upper = int(uppercase, 16) 303 else: 304 upper = code 305 306 if lowercase: 307 lower = int(lowercase, 16) 308 else: 309 lower = code 310 311 codepoint_table[code] = (upper, lower, name, alias) 312 313 if code > MAX_BMP: 314 if code != lower: 315 non_bmp_lower_map[code] = lower 316 if code != upper: 317 non_bmp_upper_map[code] = upper 318 if category == "Zs": 319 non_bmp_space_set[code] = 1 320 test_space_table.append(code) 321 if code in id_start: 322 non_bmp_id_start_set[code] = 1 323 if code in id_continue: 324 non_bmp_id_cont_set[code] = 1 325 continue 326 327 assert lower <= MAX_BMP and upper <= MAX_BMP 328 329 flags = 0 330 331 # we combine whitespace and lineterminators because in pratice we don't need them separated 332 if category == "Zs" or code in whitespace or code in line_terminator: 333 flags |= FLAG_SPACE 334 test_space_table.append(code) 335 336 # §11.6 (IdentifierStart) 337 if code in id_start: 338 flags |= FLAG_UNICODE_ID_START 339 340 # §11.6 (IdentifierPart) 341 elif code in id_continue or code in compatibility_identifier_part: 342 flags |= FLAG_UNICODE_ID_CONTINUE_ONLY 343 344 up_d = upper - code 345 low_d = lower - code 346 347 assert up_d > -65535 and up_d < 65535 348 assert low_d > -65535 and low_d < 65535 349 350 upper = up_d & 0xFFFF 351 lower = low_d & 0xFFFF 352 353 item = (upper, lower, flags) 354 355 i = cache.get(item) 356 if i is None: 357 assert item not in table 358 cache[item] = i = len(table) 359 table.append(item) 360 index[code] = i 361 362 return ( 363 table, 364 index, 365 non_bmp_lower_map, 366 non_bmp_upper_map, 367 non_bmp_space_set, 368 non_bmp_id_start_set, 369 non_bmp_id_cont_set, 370 codepoint_table, 371 test_space_table, 372 ) 373 374 375 def process_case_folding(case_folding): 376 folding_map = {} 377 rev_folding_map = {} 378 folding_dummy = (0,) 379 folding_table = [folding_dummy] 380 folding_cache = {folding_dummy: 0} 381 folding_index = [0] * (MAX_BMP + 1) 382 383 folding_tests = [] 384 folding_codes = set() 385 386 for code, mapping in read_case_folding(case_folding): 387 folding_map[code] = mapping 388 389 if mapping not in rev_folding_map: 390 rev_folding_map[mapping] = [code] 391 else: 392 rev_folding_map[mapping].append(code) 393 394 folding_codes.add(code) 395 folding_codes.add(mapping) 396 397 for code in sorted(folding_codes): 398 if code in folding_map: 399 folding = folding_map[code] 400 else: 401 folding = code 402 403 if code in rev_folding_map: 404 rev_folding = rev_folding_map[code] 405 elif folding in rev_folding_map: 406 rev_folding = [c for c in rev_folding_map[folding] if c != code] 407 else: 408 rev_folding = [] 409 410 if folding != code or rev_folding: 411 item = [code] 412 if folding != code: 413 item.append(folding) 414 folding_tests.append(item + rev_folding) 415 416 if code > MAX_BMP: 417 continue 418 419 folding_d = folding - code 420 421 assert folding_d > -65535 and folding_d < 65535 422 423 folding = folding_d & 0xFFFF 424 425 item = (folding,) 426 427 i = folding_cache.get(item) 428 if i is None: 429 assert item not in folding_table 430 folding_cache[item] = i = len(folding_table) 431 folding_table.append(item) 432 folding_index[code] = i 433 return (folding_table, folding_index, folding_tests) 434 435 436 def process_special_casing(special_casing, table, index): 437 # Unconditional special casing. 438 unconditional_tolower = {} 439 unconditional_toupper = {} 440 441 # Conditional special casing, language independent. 442 conditional_tolower = {} 443 conditional_toupper = {} 444 445 # Conditional special casing, language dependent. 446 lang_conditional_tolower = {} 447 lang_conditional_toupper = {} 448 449 def caseInfo(code): 450 (upper, lower, flags) = table[index[code]] 451 return ((code + lower) & 0xFFFF, (code + upper) & 0xFFFF) 452 453 for code, lower, upper, languages, contexts in read_special_casing(special_casing): 454 assert code <= MAX_BMP, "Unexpected character outside of BMP: %s" % code 455 assert len(languages) <= 1, "Expected zero or one language ids: %s" % languages 456 assert len(contexts) <= 1, ( 457 "Expected zero or one casing contexts: %s" % languages 458 ) 459 460 (default_lower, default_upper) = caseInfo(code) 461 special_lower = len(lower) != 1 or lower[0] != default_lower 462 special_upper = len(upper) != 1 or upper[0] != default_upper 463 464 # Invariant: If |code| has casing per UnicodeData.txt, then it also has 465 # casing rules in SpecialCasing.txt. 466 assert code == default_lower or len(lower) != 1 or code != lower[0] 467 assert code == default_upper or len(upper) != 1 or code != upper[0] 468 469 language = languages[0] if languages else None 470 context = contexts[0] if contexts else None 471 472 if not language and not context: 473 if special_lower: 474 unconditional_tolower[code] = lower 475 if special_upper: 476 unconditional_toupper[code] = upper 477 elif not language and context: 478 if special_lower: 479 conditional_tolower[code] = (lower, context) 480 if special_upper: 481 conditional_toupper[code] = (upper, context) 482 else: 483 if language not in lang_conditional_tolower: 484 lang_conditional_tolower[language] = {} 485 lang_conditional_toupper[language] = {} 486 if special_lower: 487 lang_conditional_tolower[language][code] = (lower, context) 488 if special_upper: 489 lang_conditional_toupper[language][code] = (upper, context) 490 491 # Certain special casing rules are inlined in jsstr.cpp, ensure these cases 492 # still match the current SpecialCasing.txt file. 493 def lowerCase(code): 494 (lower, _) = caseInfo(code) 495 return lower 496 497 def upperCase(code): 498 (_, upper) = caseInfo(code) 499 return upper 500 501 def ascii(char_dict): 502 return (ch for ch in char_dict.keys() if ch <= 0x7F) 503 504 def latin1(char_dict): 505 return (ch for ch in char_dict.keys() if ch <= 0xFF) 506 507 def is_empty(iterable): 508 return not any(True for _ in iterable) 509 510 def is_equals(iter1, iter2): 511 return all(x == y for (x, y) in zip_longest(iter1, iter2)) 512 513 # Ensure no ASCII characters have special case mappings. 514 assert is_empty(ascii(unconditional_tolower)) 515 assert is_empty(ascii(unconditional_toupper)) 516 assert is_empty(ascii(conditional_tolower)) 517 assert is_empty(ascii(conditional_toupper)) 518 519 # Ensure no Latin1 characters have special lower case mappings. 520 assert is_empty(latin1(unconditional_tolower)) 521 assert is_empty(latin1(conditional_tolower)) 522 523 # Ensure no Latin1 characters have conditional special upper case mappings. 524 assert is_empty(latin1(conditional_toupper)) 525 526 # Ensure U+00DF is the only Latin1 character with a special upper case mapping. 527 assert is_equals([0x00DF], latin1(unconditional_toupper)) 528 529 # Ensure U+0130 is the only character with a special lower case mapping. 530 assert is_equals([0x0130], unconditional_tolower) 531 532 # Ensure no characters have language independent conditional upper case mappings. 533 assert is_empty(conditional_toupper) 534 535 # Ensure U+03A3 is the only character with language independent conditional lower case mapping. 536 assert is_equals([0x03A3], conditional_tolower) 537 538 # Verify U+0130 and U+03A3 have simple lower case mappings. 539 assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3]) 540 541 # Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings. 542 assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_tolower.keys())) 543 assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_toupper.keys())) 544 545 # Maximum case mapping length is three characters. 546 assert ( 547 max( 548 map( 549 len, 550 chain( 551 unconditional_tolower.values(), 552 unconditional_toupper.values(), 553 map(itemgetter(0), conditional_tolower.values()), 554 map(itemgetter(0), conditional_toupper.values()), 555 map( 556 itemgetter(0), 557 chain.from_iterable( 558 d.values() for d in lang_conditional_tolower.values() 559 ), 560 ), 561 map( 562 itemgetter(0), 563 chain.from_iterable( 564 d.values() for d in lang_conditional_toupper.values() 565 ), 566 ), 567 ), 568 ) 569 ) 570 <= 3 571 ) 572 573 # Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms). 574 assert set([ 575 "After_I", 576 "After_Soft_Dotted", 577 "Final_Sigma", 578 "More_Above", 579 "Not_Before_Dot", 580 ]).issuperset( 581 set( 582 filter( 583 partial(is_not, None), 584 chain( 585 map(itemgetter(1), conditional_tolower.values()), 586 map(itemgetter(1), conditional_toupper.values()), 587 map( 588 itemgetter(1), 589 chain.from_iterable( 590 d.values() for d in lang_conditional_tolower.values() 591 ), 592 ), 593 map( 594 itemgetter(1), 595 chain.from_iterable( 596 d.values() for d in lang_conditional_toupper.values() 597 ), 598 ), 599 ), 600 ) 601 ) 602 ) 603 604 # Special casing for U+00DF (LATIN SMALL LETTER SHARP S). 605 assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [ 606 0x0053, 607 0x0053, 608 ] 609 610 # Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE). 611 assert unconditional_tolower[0x0130] == [0x0069, 0x0307] 612 613 # Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA). 614 assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == ( 615 [0x03C2], 616 "Final_Sigma", 617 ) 618 619 return (unconditional_tolower, unconditional_toupper) 620 621 622 def make_non_bmp_file(version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table): 623 file_name = "UnicodeNonBMP.h" 624 with open(file_name, mode="w", encoding="utf-8") as non_bmp_file: 625 non_bmp_file.write(mpl_license) 626 non_bmp_file.write("\n") 627 non_bmp_file.write(warning_message) 628 non_bmp_file.write(unicode_version_message.format(version)) 629 non_bmp_file.write( 630 """ 631 #ifndef util_UnicodeNonBMP_h 632 #define util_UnicodeNonBMP_h 633 634 // |MACRO| receives the following arguments 635 // MACRO(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) 636 // FROM: code point where the range starts 637 // TO: code point where the range ends 638 // LEAD: common lead surrogate of FROM and TO 639 // TRAIL_FROM: trail surrogate of FROM 640 // TRAIL_FROM: trail surrogate of TO 641 // DIFF: the difference between the code point in the range and 642 // converted code point 643 644 """ 645 ) 646 647 make_non_bmp_convert_macro( 648 non_bmp_file, "LOWERCASE", non_bmp_lower_map, codepoint_table 649 ) 650 non_bmp_file.write("\n") 651 make_non_bmp_convert_macro( 652 non_bmp_file, "UPPERCASE", non_bmp_upper_map, codepoint_table 653 ) 654 655 non_bmp_file.write( 656 """ 657 #endif /* util_UnicodeNonBMP_h */ 658 """ 659 ) 660 661 662 def write_special_casing_methods(unconditional_toupper, codepoint_table, println): 663 def hexlit(n): 664 """Returns C++ hex-literal for |n|.""" 665 return f"0x{n:04X}" 666 667 def describe_range(ranges, depth): 668 indent = depth * " " 669 for start, end in ranges: 670 if start == end: 671 println(indent, f"// {codepoint_table.full_name(start)}") 672 else: 673 println( 674 indent, 675 f"// {codepoint_table.full_name(start)} .. {codepoint_table.full_name(end)}", 676 ) 677 678 def out_range(start, end): 679 """Tests if the input character isn't a member of the set {x | start <= x <= end}.""" 680 if start == end: 681 return f"ch != {hexlit(start)}" 682 return f"ch < {hexlit(start)} || ch > {hexlit(end)}" 683 684 def in_range(start, end, parenthesize=False): 685 """Tests if the input character is in the set {x | start <= x <= end}.""" 686 if start == end: 687 return f"ch == {hexlit(start)}" 688 (left, right) = ("(", ")") if parenthesize else ("", "") 689 return f"{left}ch >= {hexlit(start)} && ch <= {hexlit(end)}{right}" 690 691 def in_any_range(ranges, spaces): 692 """Tests if the input character is included in any of the given ranges.""" 693 lines = [[]] 694 for start, end in ranges: 695 expr = in_range(start, end, parenthesize=True) 696 line = " || ".join(lines[-1] + [expr]) 697 if len(line) < (100 - len(spaces) - len(" ||")): 698 lines[-1].append(expr) 699 else: 700 lines.append([expr]) 701 return f" ||\n{spaces}".join(" || ".join(t) for t in lines) 702 703 def write_range_accept(parent_list, child_list, depth): 704 """Accepts the input character if it matches any code unit in |child_list|.""" 705 (min_parent, max_parent) = (parent_list[0], parent_list[-1]) 706 (min_child, max_child) = (child_list[0], child_list[-1]) 707 assert min_child >= min_parent 708 assert max_child <= max_parent 709 indent = depth * " " 710 711 child_ranges = list(int_ranges(child_list)) 712 has_successor = max_child != max_parent 713 714 # If |child_list| is a contiguous list of code units, emit a simple 715 # range check: |min_child <= input <= max_child|. 716 if len(child_ranges) == 1: 717 describe_range(child_ranges, depth) 718 if has_successor: 719 println(indent, f"if (ch <= {hexlit(max_child)}) {{") 720 println(indent, f" return ch >= {hexlit(min_child)};") 721 println(indent, "}") 722 else: 723 println(indent, f"return {in_range(min_child, max_child)};") 724 return 725 726 # Otherwise create a disjunction over the subranges in |child_ranges|. 727 if not has_successor: 728 spaces = indent + len("return ") * " " 729 else: 730 spaces = indent + len(" return ") * " " 731 range_test_expr = in_any_range(child_ranges, spaces) 732 733 if min_child != min_parent: 734 println(indent, f"if (ch < {hexlit(min_child)}) {{") 735 println(indent, " return false;") 736 println(indent, "}") 737 738 # If there's no successor block, we can omit the |input <= max_child| check, 739 # because it was already checked when we emitted the parent range test. 740 if not has_successor: 741 describe_range(child_ranges, depth) 742 println(indent, f"return {range_test_expr};") 743 else: 744 println(indent, f"if (ch <= {hexlit(max_child)}) {{") 745 describe_range(child_ranges, depth + 1) 746 println(indent, f" return {range_test_expr};") 747 println(indent, "}") 748 749 def write_ChangesWhenUpperCasedSpecialCasing(): 750 """Checks if the input has a special upper case mapping.""" 751 println("bool") 752 println("js::unicode::ChangesWhenUpperCasedSpecialCasing(char16_t ch)") 753 println("{") 754 755 assert unconditional_toupper, "|unconditional_toupper| is not empty" 756 757 # Sorted list of code units with special upper case mappings. 758 code_list = sorted(unconditional_toupper.keys()) 759 760 # Fail-fast if the input character isn't a special casing character. 761 println(f" if ({out_range(code_list[0], code_list[-1])}) {{") 762 println(" return false;") 763 println(" }") 764 765 for i in range(0, 16): 766 # Check if the input characters is in the range: 767 # |start_point <= input < end_point|. 768 start_point = i << 12 769 end_point = (i + 1) << 12 770 matches = [cu for cu in code_list if start_point <= cu < end_point] 771 772 # Skip empty ranges. 773 if not matches: 774 continue 775 776 # If |matches| consists of only a few characters, directly check 777 # the input against the characters in |matches|. 778 if len(matches) <= 8: 779 write_range_accept(code_list, matches, depth=1) 780 continue 781 782 # Otherwise split into further subranges. 783 784 # Only enter the if-block if the input is less-or-equals to the 785 # largest value in the current range. 786 is_last_block = matches[-1] == code_list[-1] 787 if not is_last_block: 788 println(f" if (ch <= {hexlit(matches[-1])}) {{") 789 else: 790 println(f" if (ch < {hexlit(matches[0])}) {{") 791 println(" return false;") 792 println(" }") 793 794 for j in range(0, 16): 795 inner_start = start_point + (j << 8) 796 inner_end = start_point + ((j + 1) << 8) 797 inner_matches = [cu for cu in matches if inner_start <= cu < inner_end] 798 799 if inner_matches: 800 d = 1 if is_last_block else 2 801 write_range_accept(matches, inner_matches, depth=d) 802 803 if not is_last_block: 804 println(" }") 805 806 println("}") 807 808 def write_LengthUpperCaseSpecialCasing(): 809 """Slow case: Special casing character was found, returns its mapping length.""" 810 println("size_t") 811 println("js::unicode::LengthUpperCaseSpecialCasing(char16_t ch)") 812 println("{") 813 814 println(" switch(ch) {") 815 for code, converted in sorted(unconditional_toupper.items(), key=itemgetter(0)): 816 println( 817 f" case {hexlit(code)}: return {len(converted)}; // {codepoint_table.name(code)}" 818 ) 819 println(" }") 820 println("") 821 println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");') 822 println(" return 0;") 823 824 println("}") 825 826 def write_AppendUpperCaseSpecialCasing(): 827 """Slow case: Special casing character was found, append its mapping characters.""" 828 println("void") 829 println( 830 "js::unicode::AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index)" # NOQA: E501 831 ) 832 println("{") 833 834 println(" switch(ch) {") 835 for code, converted in sorted(unconditional_toupper.items(), key=itemgetter(0)): 836 println(f" case {hexlit(code)}: // {codepoint_table.name(code)}") 837 for ch in converted: 838 println( 839 f" elements[(*index)++] = {hexlit(ch)}; // {codepoint_table.name(ch)}" 840 ) 841 println(" return;") 842 println(" }") 843 println("") 844 println(' MOZ_ASSERT_UNREACHABLE("Bad character input.");') 845 846 println("}") 847 848 write_ChangesWhenUpperCasedSpecialCasing() 849 println("") 850 write_LengthUpperCaseSpecialCasing() 851 println("") 852 write_AppendUpperCaseSpecialCasing() 853 854 855 def write_ascii_lookup_tables(table, index, write, println): 856 def is_id_compat(code): 857 return code == ord("\N{DOLLAR SIGN}") or code == ord("\N{LOW LINE}") 858 859 def is_id_start(code): 860 (upper, lower, flags) = table[index[code]] 861 return (flags & FLAG_UNICODE_ID_START) or is_id_compat(code) 862 863 def is_id_continue(code): 864 (upper, lower, flags) = table[index[code]] 865 return (flags & FLAG_UNICODE_ID_CONTINUE_ONLY) or is_id_start(code) 866 867 def is_space(code): 868 (upper, lower, flags) = table[index[code]] 869 return flags & FLAG_SPACE 870 871 def write_entries(name, predicate): 872 println(f"const bool unicode::{name}[] = {{") 873 header = "".join(f"{x: <6}" for x in range(0, 10)).rstrip() 874 println(f"/* {header} */") 875 for i in range(0, 13): 876 write(f"/* {i: >2} */") 877 for j in range(0, 10): 878 code = i * 10 + j 879 if code <= 0x7F: 880 write(" {},".format("true" if predicate(code) else "____")) 881 println("") 882 println("};") 883 884 println("") 885 println("#define ____ false") 886 887 println( 888 """ 889 /* 890 * Identifier start chars: 891 * - 36: $ 892 * - 65..90: A..Z 893 * - 95: _ 894 * - 97..122: a..z 895 */""" 896 ) 897 write_entries("js_isidstart", is_id_start) 898 899 println( 900 """ 901 /* 902 * Identifier chars: 903 * - 36: $ 904 * - 48..57: 0..9 905 * - 65..90: A..Z 906 * - 95: _ 907 * - 97..122: a..z 908 */""" 909 ) 910 write_entries("js_isident", is_id_continue) 911 912 println( 913 """ 914 /* Whitespace chars: '\\t', '\\n', '\\v', '\\f', '\\r', ' '. */""" 915 ) 916 write_entries("js_isspace", is_space) 917 918 println("") 919 println("#undef ____") 920 921 922 def write_latin1_lookup_tables(table, index, write, println): 923 def case_info(code): 924 assert 0 <= code <= MAX_BMP 925 (upper, lower, flags) = table[index[code]] 926 return ((code + upper) & 0xFFFF, (code + lower) & 0xFFFF, flags) 927 928 def toLowerCase(code): 929 (_, lower, _) = case_info(code) 930 assert lower <= 0xFF, "lower-case of Latin-1 is always Latin-1" 931 return lower 932 933 def write_entries(name, mapper): 934 println(f"const JS::Latin1Char unicode::{name}[] = {{") 935 header = "".join(f"{x: <6}" for x in range(0, 16)).rstrip() 936 println(f"/* {header} */") 937 for i in range(0, 16): 938 write(f"/* {i: >2} */") 939 for j in range(0, 16): 940 code = i * 16 + j 941 if code <= 0xFF: 942 write(f" 0x{mapper(code):02X},") 943 println("") 944 println("};") 945 946 println("") 947 write_entries("latin1ToLowerCaseTable", toLowerCase) 948 949 950 def make_bmp_mapping_test( 951 version, codepoint_table, unconditional_tolower, unconditional_toupper 952 ): 953 def unicodeEsc(n): 954 return f"\\u{n:04X}" 955 956 file_name = "../tests/non262/String/string-upper-lower-mapping.js" 957 with open(file_name, mode="w", encoding="utf-8") as output: 958 write = partial(print, file=output, sep="", end="") 959 println = partial(print, file=output, sep="", end="\n") 960 961 write(warning_message) 962 write(unicode_version_message.format(version)) 963 write(public_domain) 964 println("var mapping = [") 965 for code in range(0, MAX_BMP + 1): 966 entry = codepoint_table.get(code) 967 968 if entry: 969 (upper, lower, _, _) = entry 970 upper = ( 971 unconditional_toupper[code] 972 if code in unconditional_toupper 973 else [upper] 974 ) 975 lower = ( 976 unconditional_tolower[code] 977 if code in unconditional_tolower 978 else [lower] 979 ) 980 println( 981 ' ["{}", "{}"], /* {} */'.format( 982 "".join(map(unicodeEsc, upper)), 983 "".join(map(unicodeEsc, lower)), 984 codepoint_table.name(code), 985 ) 986 ) 987 else: 988 println(' ["{0}", "{0}"],'.format(unicodeEsc(code))) 989 println("];") 990 write( 991 """ 992 assertEq(mapping.length, 0x10000); 993 for (var i = 0; i <= 0xffff; i++) { 994 var char = String.fromCharCode(i); 995 var info = mapping[i]; 996 997 assertEq(char.toUpperCase(), info[0]); 998 assertEq(char.toLowerCase(), info[1]); 999 } 1000 1001 if (typeof reportCompare === "function") 1002 reportCompare(true, true); 1003 """ 1004 ) 1005 1006 1007 def make_non_bmp_mapping_test( 1008 version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table 1009 ): 1010 file_name = "../tests/non262/String/string-code-point-upper-lower-mapping.js" 1011 with open(file_name, mode="w", encoding="utf-8") as test_non_bmp_mapping: 1012 test_non_bmp_mapping.write(warning_message) 1013 test_non_bmp_mapping.write(unicode_version_message.format(version)) 1014 test_non_bmp_mapping.write(public_domain) 1015 1016 for code in sorted(non_bmp_upper_map.keys()): 1017 test_non_bmp_mapping.write( 1018 f"""\ 1019 assertEq(String.fromCodePoint(0x{code:04X}).toUpperCase().codePointAt(0), 0x{non_bmp_upper_map[code]:04X}); // {codepoint_table.name(code)}, {codepoint_table.name(non_bmp_upper_map[code])} 1020 """ 1021 ) 1022 1023 for code in sorted(non_bmp_lower_map.keys()): 1024 test_non_bmp_mapping.write( 1025 f"""\ 1026 assertEq(String.fromCodePoint(0x{code:04X}).toLowerCase().codePointAt(0), 0x{non_bmp_lower_map[code]:04X}); // {codepoint_table.name(code)}, {codepoint_table.name(non_bmp_lower_map[code])} 1027 """ 1028 ) 1029 1030 test_non_bmp_mapping.write( 1031 """ 1032 if (typeof reportCompare === "function") 1033 reportCompare(true, true); 1034 """ 1035 ) 1036 1037 1038 def make_space_test(version, test_space_table, codepoint_table): 1039 def hex_and_name(c): 1040 return f" 0x{c:04X} /* {codepoint_table.name(c)} */" 1041 1042 file_name = "../tests/non262/String/string-space-trim.js" 1043 with open(file_name, mode="w", encoding="utf-8") as test_space: 1044 test_space.write(warning_message) 1045 test_space.write(unicode_version_message.format(version)) 1046 test_space.write(public_domain) 1047 test_space.write("var onlySpace = String.fromCharCode(\n") 1048 test_space.write(",\n".join(map(hex_and_name, test_space_table))) 1049 test_space.write("\n);\n") 1050 test_space.write( 1051 """ 1052 assertEq(onlySpace.trim(), ""); 1053 assertEq((onlySpace + 'aaaa').trim(), 'aaaa'); 1054 assertEq(('aaaa' + onlySpace).trim(), 'aaaa'); 1055 assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa'); 1056 1057 if (typeof reportCompare === "function") 1058 reportCompare(true, true); 1059 """ 1060 ) 1061 1062 1063 def make_regexp_space_test(version, test_space_table, codepoint_table): 1064 def hex_and_name(c): 1065 return f" 0x{c:04X} /* {codepoint_table.name(c)} */" 1066 1067 file_name = "../tests/non262/RegExp/character-class-escape-s.js" 1068 with open(file_name, mode="w", encoding="utf-8") as test_space: 1069 test_space.write(warning_message) 1070 test_space.write(unicode_version_message.format(version)) 1071 test_space.write(public_domain) 1072 test_space.write("var onlySpace = String.fromCodePoint(\n") 1073 test_space.write(",\n".join(map(hex_and_name, test_space_table))) 1074 test_space.write("\n);\n") 1075 test_space.write( 1076 r""" 1077 assertEq(/^\s+$/.exec(onlySpace) !== null, true); 1078 assertEq(/^[\s]+$/.exec(onlySpace) !== null, true); 1079 assertEq(/^[^\s]+$/.exec(onlySpace) === null, true); 1080 1081 assertEq(/^\S+$/.exec(onlySpace) === null, true); 1082 assertEq(/^[\S]+$/.exec(onlySpace) === null, true); 1083 assertEq(/^[^\S]+$/.exec(onlySpace) !== null, true); 1084 1085 // Also test with Unicode RegExps. 1086 assertEq(/^\s+$/u.exec(onlySpace) !== null, true); 1087 assertEq(/^[\s]+$/u.exec(onlySpace) !== null, true); 1088 assertEq(/^[^\s]+$/u.exec(onlySpace) === null, true); 1089 1090 assertEq(/^\S+$/u.exec(onlySpace) === null, true); 1091 assertEq(/^[\S]+$/u.exec(onlySpace) === null, true); 1092 assertEq(/^[^\S]+$/u.exec(onlySpace) !== null, true); 1093 1094 if (typeof reportCompare === "function") 1095 reportCompare(true, true); 1096 """ 1097 ) 1098 1099 1100 def make_icase_test(version, folding_tests, codepoint_table): 1101 def char_hex(c): 1102 return f"0x{c:04X}" 1103 1104 file_name = "../tests/non262/RegExp/unicode-ignoreCase.js" 1105 with open(file_name, mode="w", encoding="utf-8") as test_icase: 1106 test_icase.write("// |reftest| skip-if(!this.hasOwnProperty('Intl'))\n\n") 1107 test_icase.write(warning_message) 1108 test_icase.write(unicode_version_message.format(version)) 1109 test_icase.write(public_domain) 1110 test_icase.write( 1111 """ 1112 var BUGNUMBER = 1135377; 1113 var summary = "Implement RegExp unicode flag -- ignoreCase flag."; 1114 1115 print(BUGNUMBER + ": " + summary); 1116 1117 function test(code, ...equivs) { 1118 var codeRe = new RegExp(String.fromCodePoint(code) + "+", "iu"); 1119 var ans = String.fromCodePoint(code) + equivs.map(c => String.fromCodePoint(c)).join(""); 1120 assertEqArray(codeRe.exec("<" + ans + ">"), [ans]); 1121 codeRe = new RegExp("[" + String.fromCodePoint(code) + "]+", "iu"); 1122 assertEqArray(codeRe.exec("<" + ans + ">"), [ans]); 1123 } 1124 """ 1125 ) 1126 for args in folding_tests: 1127 test_icase.write( 1128 "test({}); // {}\n".format( 1129 ", ".join(map(char_hex, args)), 1130 ", ".join(map(codepoint_table.name, args)), 1131 ) 1132 ) 1133 test_icase.write( 1134 """ 1135 if (typeof reportCompare === "function") 1136 reportCompare(true, true); 1137 """ 1138 ) 1139 1140 1141 def make_unicode_file( 1142 version, 1143 table, 1144 index, 1145 folding_table, 1146 folding_index, 1147 non_bmp_space_set, 1148 non_bmp_id_start_set, 1149 non_bmp_id_cont_set, 1150 unconditional_toupper, 1151 codepoint_table, 1152 ): 1153 index1, index2, shift = splitbins(index) 1154 1155 # Don't forget to update CharInfo in Unicode.h if you need to change this 1156 assert shift == 6 1157 1158 folding_index1, folding_index2, folding_shift = splitbins(folding_index) 1159 1160 # Don't forget to update CaseFoldInfo in Unicode.h if you need to change this 1161 assert folding_shift == 5 1162 1163 # verify correctness 1164 for char in index: 1165 test = table[index[char]] 1166 1167 idx = index1[char >> shift] 1168 idx = index2[(idx << shift) + (char & ((1 << shift) - 1))] 1169 1170 assert test == table[idx] 1171 1172 # verify correctness 1173 for char in folding_index: 1174 test = folding_table[folding_index[char]] 1175 1176 idx = folding_index1[char >> folding_shift] 1177 idx = folding_index2[ 1178 (idx << folding_shift) + (char & ((1 << folding_shift) - 1)) 1179 ] 1180 1181 assert test == folding_table[idx] 1182 1183 comment = """ 1184 /* 1185 * So how does indexing work? 1186 * First let's have a look at a char16_t, 16-bits: 1187 * [................] 1188 * Step 1: 1189 * Extracting the upper 11 bits from the char16_t. 1190 * upper = char >> 5 ([***********.....]) 1191 * Step 2: 1192 * Using these bits to get an reduced index from index1. 1193 * index = index1[upper] 1194 * Step 3: 1195 * Combining the index and the bottom 5 bits of the original char16_t. 1196 * real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++]) 1197 * 1198 * The advantage here is that the biggest number in index1 doesn't need 10 bits, 1199 * but 7 and we save some memory. 1200 * 1201 * Step 4: 1202 * Get the character informations by looking up real_index in js_charinfo. 1203 * 1204 * Pseudocode of generation: 1205 * 1206 * let table be the mapping of char16_t => js_charinfo_index 1207 * let index1 be an empty array 1208 * let index2 be an empty array 1209 * let cache be a hash map 1210 * 1211 * while shift is less then maximal amount you can shift 0xffff before it's 0 1212 * let chunks be table split in chunks of size 2**shift 1213 * 1214 * for every chunk in chunks 1215 * if chunk is in cache 1216 * let index be cache[chunk] 1217 * else 1218 * let index be the max key of index2 + 1 1219 * for element in chunk 1220 * push element to index2 1221 * put index as chunk in cache 1222 * 1223 * push index >> shift to index1 1224 * 1225 * increase shift 1226 * stop if you found the best shift 1227 */ 1228 """ 1229 1230 def dump(data, name, println): 1231 println(f"const uint8_t unicode::{name}[] = {{") 1232 1233 line = pad = " " * 4 1234 lines = [] 1235 for entry in data: 1236 assert entry < 256 1237 s = str(entry) 1238 s = s.rjust(3) 1239 1240 if len(line + s) + 5 > 99: 1241 lines.append(line.rstrip()) 1242 line = pad + s + ", " 1243 else: 1244 line = line + s + ", " 1245 lines.append(line.rstrip()) 1246 1247 println("\n".join(lines)) 1248 println("};") 1249 1250 def write_table(data_type, name, tbl, idx1_name, idx1, idx2_name, idx2, println): 1251 println(f"const {data_type} unicode::{name}[] = {{") 1252 for d in tbl: 1253 println(" {{ {} }},".format(", ".join(str(e) for e in d))) 1254 println("};") 1255 println("") 1256 1257 dump(idx1, idx1_name, println) 1258 println("") 1259 dump(idx2, idx2_name, println) 1260 println("") 1261 1262 def write_supplemental_identifier_method(name, group_set, println): 1263 println("bool") 1264 println(f"js::unicode::{name}(char32_t codePoint)") 1265 println("{") 1266 for from_code, to_code in int_ranges(group_set.keys()): 1267 println( 1268 f" if (codePoint >= 0x{from_code:X} && codePoint <= 0x{to_code:X}) {{ // {codepoint_table.name(from_code)} .. {codepoint_table.name(to_code)}" 1269 ) 1270 println(" return true;") 1271 println(" }") 1272 println(" return false;") 1273 println("}") 1274 println("") 1275 1276 file_name = "Unicode.cpp" 1277 with open(file_name, "w", encoding="utf-8") as data_file: 1278 write = partial(print, file=data_file, sep="", end="") 1279 println = partial(print, file=data_file, sep="", end="\n") 1280 1281 write(warning_message) 1282 write(unicode_version_message.format(version)) 1283 write(public_domain) 1284 println('#include "util/Unicode.h"') 1285 println("") 1286 println("using namespace js;") 1287 println("using namespace js::unicode;") 1288 write(comment) 1289 1290 write_table( 1291 "CharacterInfo", 1292 "js_charinfo", 1293 table, 1294 "index1", 1295 index1, 1296 "index2", 1297 index2, 1298 println, 1299 ) 1300 1301 write_table( 1302 "FoldingInfo", 1303 "js_foldinfo", 1304 folding_table, 1305 "folding_index1", 1306 folding_index1, 1307 "folding_index2", 1308 folding_index2, 1309 println, 1310 ) 1311 1312 # If the following assert fails, it means space character is added to 1313 # non-BMP area. In that case the following code should be uncommented 1314 # and the corresponding code should be added to frontend. (At least 1315 # unicode::IsSpace will require updating to handle this.) 1316 assert len(non_bmp_space_set.keys()) == 0 1317 1318 write_supplemental_identifier_method( 1319 "IsIdentifierStartNonBMP", non_bmp_id_start_set, println 1320 ) 1321 1322 write_supplemental_identifier_method( 1323 "IsIdentifierPartNonBMP", non_bmp_id_cont_set, println 1324 ) 1325 1326 write_special_casing_methods(unconditional_toupper, codepoint_table, println) 1327 1328 write_ascii_lookup_tables(table, index, write, println) 1329 1330 write_latin1_lookup_tables(table, index, write, println) 1331 1332 1333 def getsize(data): 1334 """return smallest possible integer size for the given array""" 1335 maxdata = max(data) 1336 assert maxdata < 2**32 1337 1338 if maxdata < 256: 1339 return 1 1340 elif maxdata < 65536: 1341 return 2 1342 else: 1343 return 4 1344 1345 1346 def splitbins(t): 1347 """t -> (t1, t2, shift). Split a table to save space. 1348 1349 t is a sequence of ints. This function can be useful to save space if 1350 many of the ints are the same. t1 and t2 are lists of ints, and shift 1351 is an int, chosen to minimize the combined size of t1 and t2 (in C 1352 code), and where for each i in range(len(t)), 1353 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] 1354 where mask is a bitmask isolating the last "shift" bits. 1355 """ 1356 1357 def dump(t1, t2, shift, bytes): 1358 print( 1359 "%d+%d bins at shift %d; %d bytes" % (len(t1), len(t2), shift, bytes), 1360 file=sys.stderr, 1361 ) 1362 print("Size of original table:", len(t) * getsize(t), "bytes", file=sys.stderr) 1363 1364 n = len(t) - 1 # last valid index 1365 maxshift = 0 # the most we can shift n and still have something left 1366 if n > 0: 1367 while n >> 1: 1368 n >>= 1 1369 maxshift += 1 1370 del n 1371 bytes = sys.maxsize # smallest total size so far 1372 t = tuple(t) # so slices can be dict keys 1373 for shift in range(maxshift + 1): 1374 t1 = [] 1375 t2 = [] 1376 size = 2**shift 1377 bincache = {} 1378 1379 for i in range(0, len(t), size): 1380 bin = t[i : i + size] 1381 1382 index = bincache.get(bin) 1383 if index is None: 1384 index = len(t2) 1385 bincache[bin] = index 1386 t2.extend(bin) 1387 t1.append(index >> shift) 1388 1389 # determine memory size 1390 b = len(t1) * getsize(t1) + len(t2) * getsize(t2) 1391 if b < bytes: 1392 best = t1, t2, shift 1393 bytes = b 1394 t1, t2, shift = best 1395 1396 print("Best:", end=" ", file=sys.stderr) 1397 dump(t1, t2, shift, bytes) 1398 1399 # exhaustively verify that the decomposition is correct 1400 mask = 2**shift - 1 1401 for i in range(len(t)): 1402 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] 1403 return best 1404 1405 1406 def update_unicode(args): 1407 base_path = os.getcwd() 1408 1409 version = args.version 1410 if version is not None: 1411 baseurl = "https://unicode.org/Public" 1412 if version == "UNIDATA": 1413 url = "%s/%s" % (baseurl, version) 1414 else: 1415 url = "%s/%s/ucd" % (baseurl, version) 1416 1417 print("Arguments:") 1418 if version is not None: 1419 print("\tVersion: %s" % version) 1420 print("\tDownload url: %s" % url) 1421 1422 request_url = f"{url}/UCD.zip" 1423 with closing(urlopen(request_url)) as downloaded_file: 1424 downloaded_data = io.BytesIO(downloaded_file.read()) 1425 1426 with ZipFile(downloaded_data) as zip_file: 1427 for fname in [ 1428 "UnicodeData.txt", 1429 "CaseFolding.txt", 1430 "DerivedCoreProperties.txt", 1431 "SpecialCasing.txt", 1432 ]: 1433 zip_file.extract(fname, path=base_path) 1434 else: 1435 print("\tUsing local files.") 1436 print("\tAlways make sure you have the newest Unicode files!") 1437 print("") 1438 1439 def version_from_file(f, fname): 1440 pat_version = re.compile(r"# %s-(?P<version>\d+\.\d+\.\d+).txt" % fname) 1441 return pat_version.match(f.readline()).group("version") 1442 1443 with open( 1444 os.path.join(base_path, "UnicodeData.txt"), encoding="utf-8" 1445 ) as unicode_data, open( 1446 os.path.join(base_path, "CaseFolding.txt"), encoding="utf-8" 1447 ) as case_folding, open( 1448 os.path.join(base_path, "DerivedCoreProperties.txt"), encoding="utf-8" 1449 ) as derived_core_properties, open( 1450 os.path.join(base_path, "SpecialCasing.txt"), encoding="utf-8" 1451 ) as special_casing: 1452 unicode_version = version_from_file( 1453 derived_core_properties, "DerivedCoreProperties" 1454 ) 1455 1456 print("Processing...") 1457 ( 1458 table, 1459 index, 1460 non_bmp_lower_map, 1461 non_bmp_upper_map, 1462 non_bmp_space_set, 1463 non_bmp_id_start_set, 1464 non_bmp_id_cont_set, 1465 codepoint_table, 1466 test_space_table, 1467 ) = process_unicode_data(unicode_data, derived_core_properties) 1468 (folding_table, folding_index, folding_tests) = process_case_folding( 1469 case_folding 1470 ) 1471 (unconditional_tolower, unconditional_toupper) = process_special_casing( 1472 special_casing, table, index 1473 ) 1474 1475 print("Generating...") 1476 make_unicode_file( 1477 unicode_version, 1478 table, 1479 index, 1480 folding_table, 1481 folding_index, 1482 non_bmp_space_set, 1483 non_bmp_id_start_set, 1484 non_bmp_id_cont_set, 1485 unconditional_toupper, 1486 codepoint_table, 1487 ) 1488 make_non_bmp_file( 1489 unicode_version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table 1490 ) 1491 1492 make_bmp_mapping_test( 1493 unicode_version, codepoint_table, unconditional_tolower, unconditional_toupper 1494 ) 1495 make_non_bmp_mapping_test( 1496 unicode_version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table 1497 ) 1498 make_space_test(unicode_version, test_space_table, codepoint_table) 1499 make_regexp_space_test(unicode_version, test_space_table, codepoint_table) 1500 make_icase_test(unicode_version, folding_tests, codepoint_table) 1501 1502 1503 if __name__ == "__main__": 1504 import argparse 1505 1506 # This script must be run from js/src/util to work correctly. 1507 if "/".join(os.path.normpath(os.getcwd()).split(os.sep)[-3:]) != "js/src/util": 1508 raise RuntimeError("%s must be run from js/src/util" % sys.argv[0]) 1509 1510 parser = argparse.ArgumentParser(description="Update Unicode data.") 1511 1512 parser.add_argument( 1513 "--version", 1514 help='Optional Unicode version number. If specified, downloads the\ 1515 selected version from <https://unicode.org/Public>. If not specified\ 1516 uses the existing local files to generate the Unicode data. The\ 1517 number must match a published Unicode version, e.g. use\ 1518 "--version=8.0.0" to download Unicode 8 files. Alternatively use\ 1519 "--version=UNIDATA" to download the latest published version.', 1520 ) 1521 1522 parser.set_defaults(func=update_unicode) 1523 1524 args = parser.parse_args() 1525 args.func(args)