base_chars.py (5530B)
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import re
from collections import namedtuple
from unicodedata import category, combining, normalize

UNICODE_LIMIT = 0x110000

UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
UNICODE_COMBINING_CLASS_KANA_VOICING = 8
UNICODE_COMBINING_CLASS_VIRAMA = 9

# Value written to the block index for a page that has no mapping block.
NO_BLOCK_INDEX = 255

# One line of the ASCII fallback table: "<char> → <replacement> ;".
FALLBACK_LINE_RE = re.compile(r"^(.) → (.+?) ;")

BaseCharMapping = namedtuple("BaseCharMapping", ("char", "base_char"))
BaseCharMappingBlock = namedtuple("BaseCharMappingBlock", ("first", "last", "offset"))


# Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
def is_combining_diacritic(char):
    """Return True if the canonical combining class of *char* is not one of
    the classes excluded below."""
    return combining(char) not in (
        UNICODE_COMBINING_CLASS_NOT_REORDERED,
        UNICODE_COMBINING_CLASS_KANA_VOICING,
        UNICODE_COMBINING_CLASS_VIRAMA,
        91,
        129,
        130,
        132,
    )


# Keep this function in sync with IsMathOrMusicSymbol in nsUnicodeProperties.h.
def is_math_or_music_symbol(char):
    """Return True if the general category of *char* is "Sm" or "So"."""
    return category(char) in ("Sm", "So")


def changes_plane(char, base_char):
    """Return True if *char* and *base_char* differ above their low 16 bits."""
    # Mappings that would change the first 16 bits of a character are not
    # currently supported. This is because the mapping table only records the
    # last 16 bits of the base character and also because moving into or out of
    # the basic multilingual plane would change the length of a UTF-16 string.
    return ord(char) >> 16 != ord(base_char) >> 16


def decomposition_base_char(char):
    """Return the base character gleaned from the NFD decomposition of *char*.

    Returns None when *char* should not be mapped: it is itself a combining
    diacritic or a math/music symbol, it does not decompose, the mapping
    would change plane, or the second character of the decomposition is not
    a combining diacritic.
    """
    if is_combining_diacritic(char) or is_math_or_music_symbol(char):
        return None
    decomposition = normalize("NFD", char)
    if len(decomposition) < 2:
        return None
    base_char = decomposition[0]
    if changes_plane(char, base_char):
        return None
    if not is_combining_diacritic(decomposition[1]):
        # Hangul syllables decompose but do not actually have diacritics.
        # This also excludes decompositions with the Japanese marks U+3099
        # and U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND
        # MARK), which we should not ignore for searching (bug 1624244).
        return None
    return base_char


def parse_fallback_line(line):
    """Parse one line of the ASCII fallback table.

    Returns a ``(char, base_char)`` tuple, or None when the line is not a
    mapping line, the replacement is longer than one character after
    unquoting/unescaping, or the mapping would change plane.
    """
    m = FALLBACK_LINE_RE.match(line)
    if not m:
        return None
    char = m.group(1)
    decomposition = m.group(2)
    # Strip surrounding single quotes: 'x' -> x
    if (
        len(decomposition) >= 3
        and decomposition.startswith("'")
        and decomposition.endswith("'")
    ):
        decomposition = decomposition[1:-1]
    # Strip a leading backslash escape: \x -> x
    if len(decomposition) >= 2 and decomposition.startswith("\\"):
        decomposition = decomposition[1:]
    if len(decomposition) > 1:
        return None
    if changes_plane(char, decomposition):
        return None
    return char, decomposition


def build_blocks(mappings):
    """Organize *mappings* (dict of char -> base char) into contiguous blocks.

    Returns ``(entries, blocks)``. ``entries`` is the list of BaseCharMapping
    tuples sorted by code point, with identity mappings inserted so that
    every block covers a gap-free range. ``blocks`` holds one
    BaseCharMappingBlock per 256-code-point page that has mappings; its
    ``first``/``last`` are the low bytes of the first and last code point in
    the page and ``offset`` is the index of its first entry in ``entries``.
    """
    entries = []
    blocks = []
    for mapping in sorted(
        BaseCharMapping(ord(k), ord(v)) for k, v in mappings.items()
    ):
        low_byte = mapping.char & 0xFF
        if entries and entries[-1].char >> 8 == mapping.char >> 8:
            # Same page as the previous entry: pad the gap with identity
            # mappings and extend the current block.
            for filler in range(entries[-1].char + 1, mapping.char):
                entries.append(BaseCharMapping(filler, filler))
            blocks[-1] = blocks[-1]._replace(last=low_byte)
        else:
            # New page: start a new block. This also handles a final page
            # that contains a single mapping, which must still get a block
            # so that it is reachable through the index.
            blocks.append(BaseCharMappingBlock(low_byte, low_byte, len(entries)))
        entries.append(mapping)
    return entries, blocks


def build_block_index(entries, blocks):
    """Return a list indexed by page (code point >> 8) giving the position of
    that page's block in *blocks*, or NO_BLOCK_INDEX for pages without one.

    The list stops at the last page that has a block.
    """
    indexes = []
    for i, block in enumerate(blocks):
        page = entries[block.offset].char >> 8
        indexes.extend([NO_BLOCK_INDEX] * (page - len(indexes)))
        indexes.append(i)
    return indexes


def main(header, fallback_table):
    """Generate the base-character mapping tables as C source.

    header: writable text stream that receives the generated C code.
    fallback_table: path of the UTF-8 encoded ASCII fallback table.
    """
    mappings = {}

    # Glean mappings from decompositions

    for code_point in range(UNICODE_LIMIT):
        char = chr(code_point)
        base_char = decomposition_base_char(char)
        if base_char is not None:
            mappings[char] = base_char

    # Add mappings from the ASCII fallback table

    with open(fallback_table, encoding="UTF-8") as table:
        for line in table:
            parsed = parse_fallback_line(line)
            if parsed is not None:
                char, base_char = parsed
                mappings[char] = base_char

    # Organize mappings into contiguous blocks

    mappings, blocks = build_blocks(mappings)
    indexes = build_block_index(mappings, blocks)

    # Write the mappings to a C header file

    header.write("struct BaseCharMappingBlock {\n")
    header.write(" uint8_t mFirst;\n")
    header.write(" uint8_t mLast;\n")
    header.write(" uint16_t mMappingStartOffset;\n")
    header.write("};\n")
    header.write("\n")
    header.write("static const uint16_t BASE_CHAR_MAPPING_LIST[] = {\n")
    for char, base_char in mappings:
        # Only the low 16 bits of the base character are recorded.
        header.write(f" /* {char:#06x} */ {base_char & 0xFFFF:#06x},")
        if char != base_char:
            header.write(" /* " + chr(char) + " → " + chr(base_char) + " */")
        header.write("\n")
    header.write("};\n")
    header.write("\n")
    header.write(
        "static const struct BaseCharMappingBlock BASE_CHAR_MAPPING_BLOCKS[] = {\n"
    )
    for block in blocks:
        page = mappings[block.offset].char >> 8
        header.write(
            f" {{{block.first:#04x}, {block.last:#04x}, "
            + str(block.offset).rjust(4)
            + f"}}, // {page:#04x}xx\n"
        )
    header.write("};\n")
    header.write("\n")
    header.write("static const uint8_t BASE_CHAR_MAPPING_BLOCK_INDEX[] = {\n")
    for i, index in enumerate(indexes):
        header.write(" " + str(index).rjust(3) + ", // " + f"{i:#04x}" + "xx\n")
    header.write("};\n")