gen-tag-table.py (37607B)
1 #!/usr/bin/env python3 2 3 """Generator of the mapping from OpenType tags to BCP 47 tags and vice 4 versa. 5 6 It creates a ``const LangTag[]``, matching the tags from the OpenType 7 languages system tag list to the language subtags of the BCP 47 language 8 subtag registry, with some manual adjustments. The mappings are 9 supplemented with macrolanguages' sublanguages and retired codes' 10 replacements, according to BCP 47 and some manual additions where BCP 47 11 omits a retired code entirely. 12 13 Also generated is a function, ``hb_ot_ambiguous_tag_to_language``, 14 intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags 15 back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to 16 multiple BCP 47 tags) are listed here, except when the alphabetically 17 first BCP 47 tag happens to be the chosen disambiguated tag. In that 18 case, the fallback behavior will choose the right tag anyway. 19 20 usage: ./gen-tag-table.py languagetags language-subtag-registry 21 22 Input files: 23 * https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags 24 * https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry 25 """ 26 27 import collections 28 import html 29 from html.parser import HTMLParser 30 import itertools 31 import re 32 import sys 33 import unicodedata 34 35 if len (sys.argv) != 3: 36 sys.exit (__doc__) 37 38 def expect (condition, message=None): 39 if not condition: 40 if message is None: 41 raise AssertionError 42 raise AssertionError (message) 43 44 def write (s): 45 sys.stdout.flush () 46 sys.stdout.buffer.write (s.encode ('utf-8')) 47 48 DEFAULT_LANGUAGE_SYSTEM = '' 49 50 # from https://www-01.sil.org/iso639-3/iso-639-3.tab 51 ISO_639_3_TO_1 = { 52 'aar': 'aa', 53 'abk': 'ab', 54 'afr': 'af', 55 'aka': 'ak', 56 'amh': 'am', 57 'ara': 'ar', 58 'arg': 'an', 59 'asm': 'as', 60 'ava': 'av', 61 'ave': 'ae', 62 'aym': 'ay', 63 'aze': 'az', 64 'bak': 'ba', 65 'bam': 'bm', 66 'bel': 'be', 67 'ben': 'bn', 68 'bis': 'bi', 69 'bod': 'bo', 70 'bos': 'bs', 71 'bre': 'br', 72 'bul': 'bg', 73 'cat': 'ca', 74 'ces': 'cs', 75 'cha': 'ch', 76 'che': 'ce', 77 'chu': 'cu', 78 'chv': 'cv', 79 'cor': 'kw', 80 'cos': 'co', 81 'cre': 'cr', 82 'cym': 'cy', 83 'dan': 'da', 84 'deu': 'de', 85 'div': 'dv', 86 'dzo': 'dz', 87 'ell': 'el', 88 'eng': 'en', 89 'epo': 'eo', 90 'est': 'et', 91 'eus': 'eu', 92 'ewe': 'ee', 93 'fao': 'fo', 94 'fas': 'fa', 95 'fij': 'fj', 96 'fin': 'fi', 97 'fra': 'fr', 98 'fry': 'fy', 99 'ful': 'ff', 100 'gla': 'gd', 101 'gle': 'ga', 102 'glg': 'gl', 103 'glv': 'gv', 104 'grn': 'gn', 105 'guj': 'gu', 106 'hat': 'ht', 107 'hau': 'ha', 108 'hbs': 'sh', 109 'heb': 'he', 110 'her': 'hz', 111 'hin': 'hi', 112 'hmo': 'ho', 113 'hrv': 'hr', 114 'hun': 'hu', 115 'hye': 'hy', 116 'ibo': 'ig', 117 'ido': 'io', 118 'iii': 'ii', 119 'iku': 'iu', 120 'ile': 'ie', 121 'ina': 'ia', 122 'ind': 'id', 123 'ipk': 'ik', 124 'isl': 'is', 125 'ita': 'it', 126 'jav': 'jv', 127 'jpn': 'ja', 128 'kal': 'kl', 129 'kan': 'kn', 130 'kas': 'ks', 131 'kat': 'ka', 132 'kau': 'kr', 133 'kaz': 'kk', 134 'khm': 'km', 135 'kik': 'ki', 136 'kin': 'rw', 137 'kir': 'ky', 138 'kom': 'kv', 139 'kon': 'kg', 140 'kor': 'ko', 141 'kua': 'kj', 142 'kur': 'ku', 143 'lao': 'lo', 144 'lat': 'la', 145 'lav': 'lv', 146 'lim': 'li', 147 'lin': 'ln', 148 'lit': 'lt', 149 'ltz': 'lb', 150 'lub': 'lu', 151 'lug': 'lg', 152 'mah': 'mh', 153 'mal': 'ml', 154 'mar': 'mr', 155 'mkd': 'mk', 156 'mlg': 'mg', 157 'mlt': 'mt', 158 'mol': 'mo', 159 'mon': 'mn', 160 'mri': 'mi', 161 'msa': 'ms', 162 'mya': 'my', 163 'nau': 'na', 164 'nav': 'nv', 165 'nbl': 'nr', 166 'nde': 'nd', 167 'ndo': 'ng', 168 'nep': 'ne', 169 'nld': 'nl', 170 'nno': 'nn', 171 'nob': 'nb', 172 'nor': 'no', 173 'nya': 'ny', 174 'oci': 'oc', 175 'oji': 'oj', 176 'ori': 'or', 177 'orm': 'om', 178 'oss': 'os', 179 'pan': 'pa', 180 'pli': 'pi', 181 'pol': 'pl', 182 'por': 'pt', 183 'pus': 'ps', 184 'que': 'qu', 185 'roh': 'rm', 186 'ron': 'ro', 187 'run': 'rn', 188 'rus': 'ru', 189 'sag': 'sg', 190 'san': 'sa', 191 'sin': 'si', 192 'slk': 'sk', 193 'slv': 'sl', 194 'sme': 'se', 195 'smo': 'sm', 196 'sna': 'sn', 197 'snd': 'sd', 198 'som': 'so', 199 'sot': 'st', 200 'spa': 'es', 201 'sqi': 'sq', 202 'srd': 'sc', 203 'srp': 'sr', 204 'ssw': 'ss', 205 'sun': 'su', 206 'swa': 'sw', 207 'swe': 'sv', 208 'tah': 'ty', 209 'tam': 'ta', 210 'tat': 'tt', 211 'tel': 'te', 212 'tgk': 'tg', 213 'tgl': 'tl', 214 'tha': 'th', 215 'tir': 'ti', 216 'ton': 'to', 217 'tsn': 'tn', 218 'tso': 'ts', 219 'tuk': 'tk', 220 'tur': 'tr', 221 'twi': 'tw', 222 'uig': 'ug', 223 'ukr': 'uk', 224 'urd': 'ur', 225 'uzb': 'uz', 226 'ven': 've', 227 'vie': 'vi', 228 'vol': 'vo', 229 'wln': 'wa', 230 'wol': 'wo', 231 'xho': 'xh', 232 'yid': 'yi', 233 'yor': 'yo', 234 'zha': 'za', 235 'zho': 'zh', 236 'zul': 'zu', 237 } 238 239 class LanguageTag (object): 240 """A BCP 47 language tag. 241 242 Attributes: 243 subtags (List[str]): The list of subtags in this tag. 244 grandfathered (bool): Whether this tag is grandfathered. If 245 ``true``, the entire lowercased tag is the ``language`` 246 and the other subtag fields are empty. 247 language (str): The language subtag. 248 script (str): The script subtag. 249 region (str): The region subtag. 250 variant (str): The variant subtag. 251 252 Args: 253 tag (str): A BCP 47 language tag. 254 255 """ 256 def __init__ (self, tag): 257 global bcp_47 258 self.subtags = tag.lower ().split ('-') 259 self.grandfathered = tag.lower () in bcp_47.grandfathered 260 if self.grandfathered: 261 self.language = tag.lower () 262 self.script = '' 263 self.region = '' 264 self.variant = '' 265 else: 266 self.language = self.subtags[0] 267 self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags) 268 self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:]) 269 self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags) 270 271 def __str__(self): 272 return '-'.join(self.subtags) 273 274 def __repr__ (self): 275 return 'LanguageTag(%r)' % str(self) 276 277 @staticmethod 278 def _find_first (function, sequence): 279 try: 280 return next (iter (filter (function, sequence))) 281 except StopIteration: 282 return None 283 284 def is_complex (self): 285 """Return whether this tag is too complex to represent as a 286 ``LangTag`` in the generated code. 287 288 Complex tags need to be handled in 289 ``hb_ot_tags_from_complex_language``. 290 291 Returns: 292 Whether this tag is complex. 293 """ 294 return not (len (self.subtags) == 1 295 or self.grandfathered 296 and len (self.subtags[1]) != 3 297 and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language]) 298 299 def get_group (self): 300 """Return the group into which this tag should be categorized in 301 ``hb_ot_tags_from_complex_language``. 302 303 The group is the first letter of the tag, or ``'und'`` if this tag 304 should not be matched in a ``switch`` statement in the generated 305 code. 306 307 Returns: 308 This tag's group. 309 """ 310 return ('und' 311 if (self.language == 'und' 312 or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1) 313 else self.language[0]) 314 315 class OpenTypeRegistryParser (HTMLParser): 316 """A parser for the OpenType language system tag registry. 317 318 Attributes: 319 header (str): The "last updated" line of the registry. 320 names (Mapping[str, str]): A map of language system tags to the 321 names they are given in the registry. 322 ranks (DefaultDict[str, int]): A map of language system tags to 323 numbers. If a single BCP 47 tag corresponds to multiple 324 OpenType tags, the tags are ordered in increasing order by 325 rank. The rank is based on the number of BCP 47 tags 326 associated with a tag, though it may be manually modified. 327 to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of 328 OpenType language system tags to sets of BCP 47 tags. 329 from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47`` 330 inverted. Its values start as unsorted sets; 331 ``sort_languages`` converts them to sorted lists. 332 from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]): 333 A copy of ``from_bcp_47``. It starts as ``None`` and is 334 populated at the beginning of the first call to 335 ``inherit_from_macrolanguages``. 336 337 """ 338 def __init__ (self): 339 HTMLParser.__init__ (self) 340 self.header = '' 341 self.names = {} 342 self.ranks = collections.defaultdict (int) 343 self.to_bcp_47 = collections.defaultdict (set) 344 self.from_bcp_47 = collections.defaultdict (set) 345 self.from_bcp_47_uninherited = None 346 # Whether the parser is in a <td> element 347 self._td = False 348 # Whether the parser ignores the rest of the current <td> element 349 self._disengaged = False 350 # The text of the <td> elements of the current <tr> element. 351 self._current_tr = [] 352 353 def handle_starttag (self, tag, attrs): 354 if tag == 'a': 355 if self._current_tr and not self._disengaged: 356 self._current_tr[-1] = '' 357 self._disengaged = True 358 elif tag == 'br': 359 self._disengaged = True 360 elif tag == 'meta': 361 for attr, value in attrs: 362 if attr == 'name' and value == 'updated_at': 363 self.header = self.get_starttag_text () 364 break 365 elif tag == 'td': 366 self._td = True 367 self._current_tr.append ('') 368 elif tag == 'tr': 369 self._disengaged = False 370 self._current_tr = [] 371 372 def handle_endtag (self, tag): 373 if tag == 'td': 374 self._td = False 375 self._disengaged = False 376 elif tag == 'tr' and self._current_tr: 377 expect (2 <= len (self._current_tr) <= 3) 378 name = self._current_tr[0].strip () 379 tag = self._current_tr[1].strip ("\t\n\v\f\r '") 380 rank = 0 381 if len (tag) > 4: 382 expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag) 383 name += ' (deprecated)' 384 tag = tag.split (' ')[0] 385 rank = 1 386 self.names[tag] = re.sub (' languages$', '', name) 387 if not self._current_tr[2]: 388 return 389 iso_codes = self._current_tr[2].strip () 390 self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (',')) 391 rank += 2 * len (self.to_bcp_47[tag]) 392 self.ranks[tag] = rank 393 394 def handle_data (self, data): 395 if self._td and not self._disengaged: 396 self._current_tr[-1] += data 397 398 def handle_charref (self, name): 399 self.handle_data (html.unescape ('&#%s;' % name)) 400 401 def handle_entityref (self, name): 402 self.handle_data (html.unescape ('&%s;' % name)) 403 404 def parse (self, filename): 405 """Parse the OpenType language system tag registry. 406 407 Args: 408 filename (str): The file name of the registry. 409 """ 410 with open (filename, encoding='utf-8') as f: 411 self.feed (f.read ()) 412 expect (self.header) 413 for tag, iso_codes in self.to_bcp_47.items (): 414 for iso_code in iso_codes: 415 self.from_bcp_47[iso_code].add (tag) 416 417 def add_language (self, bcp_47_tag, ot_tag): 418 """Add a language as if it were in the registry. 419 420 Args: 421 bcp_47_tag (str): A BCP 47 tag. If the tag is more than just 422 a language subtag, and if the language subtag is a 423 macrolanguage, then new languages are added corresponding 424 to the macrolanguages' individual languages with the 425 remainder of the tag appended. 426 ot_tag (str): An OpenType language system tag. 427 """ 428 global bcp_47 429 self.to_bcp_47[ot_tag].add (bcp_47_tag) 430 self.from_bcp_47[bcp_47_tag].add (ot_tag) 431 if bcp_47_tag.lower () not in bcp_47.grandfathered: 432 try: 433 [macrolanguage, suffix] = bcp_47_tag.split ('-', 1) 434 if macrolanguage in bcp_47.macrolanguages: 435 s = set () 436 for language in bcp_47.macrolanguages[macrolanguage]: 437 if language.lower () not in bcp_47.grandfathered: 438 s.add ('%s-%s' % (language, suffix)) 439 bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s 440 except ValueError: 441 pass 442 443 @staticmethod 444 def _remove_language (tag_1, dict_1, dict_2): 445 for tag_2 in dict_1.pop (tag_1): 446 dict_2[tag_2].remove (tag_1) 447 if not dict_2[tag_2]: 448 del dict_2[tag_2] 449 450 def remove_language_ot (self, ot_tag): 451 """Remove an OpenType tag from the registry. 452 453 Args: 454 ot_tag (str): An OpenType tag. 455 """ 456 self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47) 457 458 def remove_language_bcp_47 (self, bcp_47_tag): 459 """Remove a BCP 47 tag from the registry. 460 461 Args: 462 bcp_47_tag (str): A BCP 47 tag. 463 """ 464 self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47) 465 466 def inherit_from_macrolanguages (self): 467 """Copy mappings from macrolanguages to individual languages. 468 469 If a BCP 47 tag for an individual mapping has no OpenType 470 mapping but its macrolanguage does, the mapping is copied to 471 the individual language. For example, als (Tosk Albanian) has no 472 explicit mapping, so it inherits from sq (Albanian) the mapping 473 to SQI. 474 475 However, if an OpenType tag maps to a BCP 47 macrolanguage and 476 some but not all of its individual languages, the mapping is not 477 inherited from the macrolanguage to the missing individual 478 languages. For example, INUK (Nunavik Inuktitut) is mapped to 479 ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to 480 ikt (Inuinnaqtun, which is an individual language of iu), so 481 this method does not add a mapping from ikt to INUK. 482 483 If a BCP 47 tag for a macrolanguage has no OpenType mapping but 484 some of its individual languages do, their mappings are copied 485 to the macrolanguage. 486 """ 487 global bcp_47 488 first_time = self.from_bcp_47_uninherited is None 489 if first_time: 490 self.from_bcp_47_uninherited = dict (self.from_bcp_47) 491 for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): 492 ot_macrolanguages = { 493 ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ()) 494 } 495 blocked_ot_macrolanguages = set () 496 if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''): 497 for ot_macrolanguage in ot_macrolanguages: 498 round_trip_macrolanguages = { 499 l for l in self.to_bcp_47[ot_macrolanguage] 500 if 'retired code' not in bcp_47.scopes.get (l, '') 501 } 502 round_trip_languages = { 503 l for l in languages 504 if 'retired code' not in bcp_47.scopes.get (l, '') 505 } 506 intersection = round_trip_macrolanguages & round_trip_languages 507 if intersection and intersection != round_trip_languages: 508 blocked_ot_macrolanguages.add (ot_macrolanguage) 509 if ot_macrolanguages: 510 for ot_macrolanguage in ot_macrolanguages: 511 if ot_macrolanguage not in blocked_ot_macrolanguages: 512 for language in languages: 513 self.add_language (language, ot_macrolanguage) 514 if not blocked_ot_macrolanguages: 515 self.ranks[ot_macrolanguage] += 1 516 elif first_time: 517 for language in languages: 518 if language in self.from_bcp_47_uninherited: 519 ot_macrolanguages |= self.from_bcp_47_uninherited[language] 520 else: 521 ot_macrolanguages.clear () 522 if not ot_macrolanguages: 523 break 524 for ot_macrolanguage in ot_macrolanguages: 525 self.add_language (macrolanguage, ot_macrolanguage) 526 527 def sort_languages (self): 528 """Sort the values of ``from_bcp_47`` in ascending rank order.""" 529 for language, tags in self.from_bcp_47.items (): 530 self.from_bcp_47[language] = sorted (tags, 531 key=lambda t: (self.ranks[t] + rank_delta (language, t), t)) 532 533 ot = OpenTypeRegistryParser () 534 535 class BCP47Parser (object): 536 """A parser for the BCP 47 subtag registry. 537 538 Attributes: 539 header (str): The "File-Date" line of the registry. 540 names (Mapping[str, str]): A map of subtags to the names they 541 are given in the registry. Each value is a 542 ``'\\n'``-separated list of names. 543 scopes (Mapping[str, str]): A map of language subtags to strings 544 suffixed to language names, including suffixes to explain 545 language scopes. 546 macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of 547 language subtags to the sets of language subtags which 548 inherit from them. See 549 ``OpenTypeRegistryParser.inherit_from_macrolanguages``. 550 prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant 551 subtags to their prefixes. 552 grandfathered (AbstractSet[str]): The set of grandfathered tags, 553 normalized to lowercase. 554 555 """ 556 def __init__ (self): 557 self.header = '' 558 self.names = {} 559 self.scopes = {} 560 self.macrolanguages = collections.defaultdict (set) 561 self.prefixes = collections.defaultdict (set) 562 self.grandfathered = set () 563 564 def parse (self, filename): 565 """Parse the BCP 47 subtag registry. 566 567 Args: 568 filename (str): The file name of the registry. 569 """ 570 with open (filename, encoding='utf-8') as f: 571 subtag_type = None 572 subtag = None 573 deprecated = False 574 has_preferred_value = False 575 line_buffer = '' 576 for line in itertools.chain (f, ['']): 577 line = line.rstrip () 578 if line.startswith (' '): 579 line_buffer += line[1:] 580 continue 581 line, line_buffer = line_buffer, line 582 if line.startswith ('Type: '): 583 subtag_type = line.split (' ')[1] 584 deprecated = False 585 has_preferred_value = False 586 elif line.startswith ('Subtag: ') or line.startswith ('Tag: '): 587 subtag = line.split (' ')[1] 588 if subtag_type == 'grandfathered': 589 self.grandfathered.add (subtag.lower ()) 590 elif line.startswith ('Description: '): 591 description = line.split (' ', 1)[1].replace (' (individual language)', '') 592 description = re.sub (r' (\(family\)|\((individual |macro)language\)|languages)$', '', 593 description) 594 if subtag in self.names: 595 self.names[subtag] += '\n' + description 596 else: 597 self.names[subtag] = description 598 elif subtag_type == 'language' or subtag_type == 'grandfathered': 599 if line.startswith ('Scope: '): 600 scope = line.split (' ')[1] 601 if scope == 'macrolanguage': 602 scope = ' [macrolanguage]' 603 elif scope == 'collection': 604 scope = ' [collection]' 605 else: 606 continue 607 self.scopes[subtag] = scope 608 elif line.startswith ('Deprecated: '): 609 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '') 610 deprecated = True 611 elif deprecated and line.startswith ('Comments: see '): 612 # If a subtag is split into multiple replacement subtags, 613 # it essentially represents a macrolanguage. 614 for language in line.replace (',', '').split (' ')[2:]: 615 self._add_macrolanguage (subtag, language) 616 elif line.startswith ('Preferred-Value: '): 617 # If a subtag is deprecated in favor of a single replacement subtag, 618 # it is either a dialect or synonym of the preferred subtag. Either 619 # way, it is close enough to the truth to consider the replacement 620 # the macrolanguage of the deprecated language. 621 has_preferred_value = True 622 macrolanguage = line.split (' ')[1] 623 self._add_macrolanguage (macrolanguage, subtag) 624 elif not has_preferred_value and line.startswith ('Macrolanguage: '): 625 self._add_macrolanguage (line.split (' ')[1], subtag) 626 elif subtag_type == 'variant': 627 if line.startswith ('Deprecated: '): 628 self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '') 629 elif line.startswith ('Prefix: '): 630 self.prefixes[subtag].add (line.split (' ')[1]) 631 elif line.startswith ('File-Date: '): 632 self.header = line 633 expect (self.header) 634 635 def _add_macrolanguage (self, macrolanguage, language): 636 global ot 637 if language not in ot.from_bcp_47: 638 for l in self.macrolanguages.get (language, set ()): 639 self._add_macrolanguage (macrolanguage, l) 640 if macrolanguage not in ot.from_bcp_47: 641 for ls in list (self.macrolanguages.values ()): 642 if macrolanguage in ls: 643 ls.add (language) 644 return 645 self.macrolanguages[macrolanguage].add (language) 646 647 def remove_extra_macrolanguages (self): 648 """Make every language have at most one macrolanguage.""" 649 inverted = collections.defaultdict (list) 650 for macrolanguage, languages in self.macrolanguages.items (): 651 for language in languages: 652 inverted[language].append (macrolanguage) 653 for language, macrolanguages in inverted.items (): 654 if len (macrolanguages) > 1: 655 macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml])) 656 biggest_macrolanguage = macrolanguages.pop () 657 for macrolanguage in macrolanguages: 658 self._add_macrolanguage (biggest_macrolanguage, macrolanguage) 659 660 def _get_name_piece (self, subtag): 661 """Return the first name of a subtag plus its scope suffix. 662 663 Args: 664 subtag (str): A BCP 47 subtag. 665 666 Returns: 667 The name form of ``subtag``. 668 """ 669 return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '') 670 671 def get_name (self, lt): 672 """Return the names of the subtags in a language tag. 673 674 Args: 675 lt (LanguageTag): A BCP 47 language tag. 676 677 Returns: 678 The name form of ``lt``. 679 """ 680 name = self._get_name_piece (lt.language) 681 if lt.script: 682 name += '; ' + self._get_name_piece (lt.script.title ()) 683 if lt.region: 684 name += '; ' + self._get_name_piece (lt.region.upper ()) 685 if lt.variant: 686 name += '; ' + self._get_name_piece (lt.variant) 687 return name 688 689 bcp_47 = BCP47Parser () 690 691 ot.parse (sys.argv[1]) 692 bcp_47.parse (sys.argv[2]) 693 694 ot.add_language ('ary', 'MOR') 695 696 ot.add_language ('ath', 'ATH') 697 698 ot.add_language ('bai', 'BML') 699 700 ot.ranks['BAL'] = ot.ranks['KAR'] + 1 701 702 ot.add_language ('ber', 'BBR') 703 704 ot.remove_language_ot ('PGR') 705 ot.add_language ('el-polyton', 'PGR') 706 707 bcp_47.names['flm'] = 'Falam Chin' 708 bcp_47.scopes['flm'] = ' (retired code)' 709 bcp_47.macrolanguages['flm'] = {'cfm'} 710 711 ot.ranks['FNE'] = ot.ranks['TNE'] + 1 712 713 ot.add_language ('und-fonipa', 'IPPH') 714 715 ot.add_language ('und-fonnapa', 'APPH') 716 717 ot.add_language ('ga-Latg', 'IRT') 718 719 ot.add_language ('hy-arevmda', 'HYE') 720 721 ot.remove_language_ot ('KGE') 722 ot.add_language ('und-Geok', 'KGE') 723 724 ot.add_language ('kht', 'KHN') 725 ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)' 726 ot.ranks['KHN'] = ot.ranks['KHT'] + 1 727 728 ot.ranks['LCR'] = ot.ranks['MCR'] + 1 729 730 ot.names['MAL'] = 'Malayalam Traditional' 731 ot.ranks['MLR'] += 1 732 733 bcp_47.names['mhv'] = 'Arakanese' 734 bcp_47.scopes['mhv'] = ' (retired code)' 735 736 ot.add_language ('mnw-TH', 'MONT') 737 738 ot.add_language ('no', 'NOR') 739 740 ot.add_language ('oc-provenc', 'PRO') 741 742 ot.remove_language_ot ('QUZ') 743 ot.add_language ('qu', 'QUZ') 744 ot.add_language ('qub', 'QWH') 745 ot.add_language ('qud', 'QVI') 746 ot.add_language ('qug', 'QVI') 747 ot.add_language ('qul', 'QUH') 748 ot.add_language ('qup', 'QVI') 749 ot.add_language ('qur', 'QWH') 750 ot.add_language ('qus', 'QUH') 751 ot.add_language ('quw', 'QVI') 752 ot.add_language ('qux', 'QWH') 753 ot.add_language ('qva', 'QWH') 754 ot.add_language ('qvh', 'QWH') 755 ot.add_language ('qvj', 'QVI') 756 ot.add_language ('qvl', 'QWH') 757 ot.add_language ('qvm', 'QWH') 758 ot.add_language ('qvn', 'QWH') 759 ot.add_language ('qvo', 'QVI') 760 ot.add_language ('qvp', 'QWH') 761 ot.add_language ('qvw', 'QWH') 762 ot.add_language ('qvz', 'QVI') 763 ot.add_language ('qwa', 'QWH') 764 ot.add_language ('qws', 'QWH') 765 ot.add_language ('qxa', 'QWH') 766 ot.add_language ('qxc', 'QWH') 767 ot.add_language ('qxh', 'QWH') 768 ot.add_language ('qxl', 'QVI') 769 ot.add_language ('qxn', 'QWH') 770 ot.add_language ('qxo', 'QWH') 771 ot.add_language ('qxr', 'QVI') 772 ot.add_language ('qxt', 'QWH') 773 ot.add_language ('qxw', 'QWH') 774 775 bcp_47.macrolanguages['ro-MD'].add ('mo') 776 777 ot.remove_language_ot ('SYRE') 778 ot.remove_language_ot ('SYRJ') 779 ot.remove_language_ot ('SYRN') 780 ot.add_language ('und-Syre', 'SYRE') 781 ot.add_language ('und-Syrj', 'SYRJ') 782 ot.add_language ('und-Syrn', 'SYRN') 783 784 bcp_47.names['xst'] = "Silt'e" 785 bcp_47.scopes['xst'] = ' (retired code)' 786 bcp_47.macrolanguages['xst'] = {'stv', 'wle'} 787 788 ot.add_language ('xwo', 'TOD') 789 790 ot.remove_language_ot ('ZHH') 791 ot.remove_language_ot ('ZHP') 792 ot.remove_language_ot ('ZHT') 793 ot.remove_language_ot ('ZHTM') 794 bcp_47.macrolanguages['zh'].remove ('lzh') 795 bcp_47.macrolanguages['zh'].remove ('yue') 796 ot.add_language ('zh-Hant-MO', 'ZHH') 797 ot.add_language ('zh-Hant-MO', 'ZHTM') 798 ot.add_language ('zh-Hant-HK', 'ZHH') 799 ot.add_language ('zh-Hans', 'ZHS') 800 ot.add_language ('zh-Hant', 'ZHT') 801 ot.add_language ('zh-HK', 'ZHH') 802 ot.add_language ('zh-MO', 'ZHH') 803 ot.add_language ('zh-MO', 'ZHTM') 804 ot.add_language ('zh-TW', 'ZHT') 805 ot.add_language ('lzh', 'ZHT') 806 ot.add_language ('lzh-Hans', 'ZHS') 807 ot.add_language ('yue', 'ZHH') 808 ot.add_language ('yue-Hans', 'ZHS') 809 810 def rank_delta (bcp_47, ot): 811 """Return a delta to apply to a BCP 47 tag's rank. 812 813 Most OpenType tags have a constant rank, but a few have ranks that 814 depend on the BCP 47 tag. 815 816 Args: 817 bcp_47 (str): A BCP 47 tag. 818 ot (str): An OpenType tag to. 819 820 Returns: 821 A number to add to ``ot``'s rank when sorting ``bcp_47``'s 822 OpenType equivalents. 823 """ 824 if bcp_47 == 'ak' and ot == 'AKA': 825 return -1 826 if bcp_47 == 'tw' and ot == 'TWI': 827 return -1 828 return 0 829 830 disambiguation = { 831 'ALT': 'alt', 832 'ARK': 'rki', 833 'ATH': 'ath', 834 'BHI': 'bhb', 835 'BLN': 'bjt', 836 'BTI': 'beb', 837 'CCHN': 'cco', 838 'CMR': 'swb', 839 'CPP': 'crp', 840 'CRR': 'crx', 841 'DUJ': 'dwu', 842 'ECR': 'crj', 843 'HAL': 'cfm', 844 'HND': 'hnd', 845 'HYE': 'hyw', 846 'KIS': 'kqs', 847 'KUI': 'uki', 848 'LRC': 'bqi', 849 'NDB': 'nd', 850 'NIS': 'njz', 851 'PLG': 'pce', 852 'PRO': 'pro', 853 'QIN': 'bgr', 854 'QUH': 'quh', 855 'QVI': 'qvi', 856 'QWH': 'qwh', 857 'SIG': 'stv', 858 'SRB': 'sr', 859 'SXT': 'xnj', 860 'ZHH': 'zh-HK', 861 'ZHS': 'zh-Hans', 862 'ZHT': 'zh-Hant', 863 'ZHTM': 'zh-MO', 864 } 865 866 ot.inherit_from_macrolanguages () 867 bcp_47.remove_extra_macrolanguages () 868 ot.inherit_from_macrolanguages () 869 ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/' 870 ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1 871 for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names): 872 possible_bcp_47_tag = tricky_ot_tag.lower () 873 if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]: 874 ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM) 875 bcp_47.macrolanguages[possible_bcp_47_tag] = set () 876 ot.sort_languages () 877 878 print ('/* == Start of generated table == */') 879 print ('/*') 880 print (' * The following table is generated by running:') 881 print (' *') 882 print (' * %s languagetags language-subtag-registry' % sys.argv[0]) 883 print (' *') 884 print (' * on files with these headers:') 885 print (' *') 886 print (' * %s' % ot.header.strip ()) 887 print (' * %s' % bcp_47.header) 888 print (' */') 889 print () 890 print ('#ifndef HB_OT_TAG_TABLE_HH') 891 print ('#define HB_OT_TAG_TABLE_HH') 892 print () 893 894 def hb_tag (tag): 895 """Convert a tag to ``HB_TAG`` form. 896 897 Args: 898 tag (str): An OpenType tag. 899 900 Returns: 901 A snippet of C++ representing ``tag``. 902 """ 903 if tag == DEFAULT_LANGUAGE_SYSTEM: 904 return 'HB_TAG_NONE\t ' 905 return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4]) 906 907 def get_variant_set (name): 908 """Return a set of variant language names from a name. 909 910 Args: 911 name (str): A list of language names from the BCP 47 registry, 912 joined on ``'\\n'``. 913 914 Returns: 915 A set of normalized language names. 916 """ 917 return set (unicodedata.normalize ( 918 'NFD', 919 n.replace ('\u02BC', "'").replace ('\u2019', "'"), 920 ) 921 .encode ('ASCII', 'ignore') 922 .strip () 923 for n in re.split ('[\n(),]', name) if n) 924 925 def language_name_intersection (a, b): 926 """Return the names in common between two language names. 927 928 Args: 929 a (str): A list of language names from the BCP 47 registry, 930 joined on ``'\\n'``. 931 b (str): A list of language names from the BCP 47 registry, 932 joined on ``'\\n'``. 933 934 Returns: 935 The normalized language names shared by ``a`` and ``b``. 936 """ 937 return get_variant_set (a).intersection (get_variant_set (b)) 938 939 def get_matching_language_name (intersection, candidates): 940 return next (iter (c for c in candidates if not intersection.isdisjoint (get_variant_set (c)))) 941 942 def same_tag (bcp_47_tag, ot_tags): 943 return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower () 944 945 for language_len in (2, 3): 946 if language_len == 3: 947 print ('#ifndef HB_NO_LANGUAGE_LONG') 948 print ('static const LangTag ot_languages%d[] = {' % language_len) 949 for language, tags in sorted (ot.from_bcp_47.items ()): 950 if language == '' or '-' in language: 951 continue 952 if len(language) != language_len: continue 953 commented_out = same_tag (language, tags) 954 for i, tag in enumerate (tags, start=1): 955 print ('%s{%s,\t%s},' % ('/*' if commented_out else ' ', hb_tag (language), hb_tag (tag)), end='') 956 if commented_out: 957 print ('*/', end='') 958 print ('\t/* ', end='') 959 bcp_47_name = bcp_47.names.get (language, '') 960 bcp_47_name_candidates = bcp_47_name.split ('\n') 961 ot_name = ot.names[tag] 962 scope = bcp_47.scopes.get (language, '') 963 if tag == DEFAULT_LANGUAGE_SYSTEM: 964 write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}') 965 else: 966 intersection = language_name_intersection (bcp_47_name, ot_name) 967 if not intersection: 968 write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name)) 969 else: 970 name = get_matching_language_name (intersection, bcp_47_name_candidates) 971 bcp_47.names[language] = name 972 write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope)) 973 print (' */') 974 print ('};') 975 if language_len == 3: 976 print ('#endif') 977 print () 978 979 print ('/**') 980 print (' * hb_ot_tags_from_complex_language:') 981 print (' * @lang_str: a BCP 47 language tag to convert.') 982 print (' * @limit: a pointer to the end of the substring of @lang_str to consider for') 983 print (' * conversion.') 984 print (' * @count: maximum number of language tags to retrieve (IN) and actual number of') 985 print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.') 986 print (' * @tags: array of size at least @language_count to store the language tag') 987 print (' * results') 988 print (' *') 989 print (' * Converts a multi-subtag BCP 47 language tag to language tags.') 990 print (' *') 991 print (' * Return value: Whether any language systems were retrieved.') 992 print (' **/') 993 print ('static inline bool') 994 print ('hb_ot_tags_from_complex_language (const char *lang_str,') 995 print ('\t\t\t\t const char *limit,') 996 print ('\t\t\t\t unsigned int *count /* IN/OUT */,') 997 print ('\t\t\t\t hb_tag_t *tags /* OUT */)') 998 print ('{') 999 1000 def print_subtag_matches (subtag, string, new_line): 1001 if subtag: 1002 if new_line: 1003 print () 1004 print ('\t&& ', end='') 1005 print ('subtag_matches (%s, limit, "-%s", %i)' % (string, subtag, 1 + len (subtag)), end='') 1006 1007 complex_tags = collections.defaultdict (list) 1008 for initial, group in itertools.groupby ((lt_tags for lt_tags in [ 1009 (LanguageTag (language), tags) 1010 for language, tags in sorted (ot.from_bcp_47.items (), 1011 key=lambda i: (-len (i[0]), i[0])) 1012 ] if lt_tags[0].is_complex ()), 1013 key=lambda lt_tags: lt_tags[0].get_group ()): 1014 complex_tags[initial] += group 1015 1016 # Calculate the min length of the subtags outside the switch 1017 min_subtag_len = 100 1018 for initial, items in sorted (complex_tags.items ()): 1019 if initial != 'und': 1020 continue 1021 for lt, tags in items: 1022 if not tags: 1023 continue 1024 subtag_len = 0 1025 subtag_len += 1 + len (lt.script) if lt.script is not None else 0 1026 subtag_len += 1 + len (lt.region) if lt.region is not None else 0 1027 subtag_len += 1 + len (lt.variant) if lt.variant is not None else 0 1028 min_subtag_len = min(subtag_len, min_subtag_len) 1029 1030 print (' if (limit - lang_str >= %d)' % (min_subtag_len + 2)) 1031 print (' {') 1032 print (" const char *p = strchr (lang_str, '-');") 1033 print (" if (!p || p >= limit || limit - p < %i) goto out;" % min_subtag_len) 1034 for initial, items in sorted (complex_tags.items ()): 1035 if initial != 'und': 1036 continue 1037 for lt, tags in items: 1038 if not tags: 1039 continue 1040 if lt.variant in bcp_47.prefixes: 1041 expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language, 1042 '%s is not a valid prefix of %s' % (lt.language, lt.variant)) 1043 print (' if (', end='') 1044 print_subtag_matches (lt.script, 'p', False) 1045 print_subtag_matches (lt.region, 'p', False) 1046 print_subtag_matches (lt.variant, 'p', False) 1047 print (')') 1048 print (' {') 1049 write (' /* %s */' % bcp_47.get_name (lt)) 1050 print () 1051 if len (tags) == 1: 1052 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) 1053 print () 1054 print (' *count = 1;') 1055 else: 1056 print (' hb_tag_t possible_tags[] = {') 1057 for tag in tags: 1058 write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag])) 1059 print () 1060 print (' };') 1061 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) 1062 print ('\ttags[i] = possible_tags[i];') 1063 print (' *count = i;') 1064 print (' return true;') 1065 print (' }') 1066 print (' }') 1067 print ('out:') 1068 1069 print (' switch (lang_str[0])') 1070 print (' {') 1071 for initial, items in sorted (complex_tags.items ()): 1072 if initial == 'und': 1073 continue 1074 print (" case '%s':" % initial) 1075 for lt, tags in items: 1076 if not tags: 1077 continue 1078 print (' if (', end='') 1079 script = lt.script 1080 region = lt.region 1081 if lt.grandfathered: 1082 print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='') 1083 else: 1084 string_literal = lt.language[1:] + '-' 1085 if script: 1086 string_literal += script 1087 script = None 1088 if region: 1089 string_literal += '-' + region 1090 region = None 1091 if string_literal[-1] == '-': 1092 print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='') 1093 else: 1094 print ('lang_matches (&lang_str[1], limit, "%s", %i)' % (string_literal, len (string_literal)), end='') 1095 print_subtag_matches (script, 'lang_str', True) 1096 print_subtag_matches (region, 'lang_str', True) 1097 print_subtag_matches (lt.variant, 'lang_str', True) 1098 print (')') 1099 print (' {') 1100 write (' /* %s */' % bcp_47.get_name (lt)) 1101 print () 1102 if len (tags) == 1: 1103 write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]])) 1104 print () 1105 print (' *count = 1;') 1106 else: 1107 print (' unsigned int i;') 1108 print (' hb_tag_t possible_tags[] = {') 1109 for tag in tags: 1110 write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag])) 1111 print () 1112 print (' };') 1113 print (' for (i = 0; i < %s && i < *count; i++)' % len (tags)) 1114 print ('\ttags[i] = possible_tags[i];') 1115 print (' *count = i;') 1116 print (' return true;') 1117 print (' }') 1118 print (' break;') 1119 1120 print (' }') 1121 print (' return false;') 1122 print ('}') 1123 print () 1124 print ('/**') 1125 print (' * hb_ot_ambiguous_tag_to_language') 1126 print (' * @tag: A language tag.') 1127 print (' *') 1128 print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to') 1129 print (' * many language tags) and the best tag is not the first (sorted alphabetically,') 1130 print (' * with two-letter tags having priority over all three-letter tags), or if the') 1131 print (' * best tag consists of multiple subtags, or if the best tag does not appear in') 1132 print (' * #ot_languages2 or #ot_languages3.') 1133 print (' *') 1134 print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,') 1135 print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.') 1136 print (' **/') 1137 print ('static inline hb_language_t') 1138 print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)') 1139 print ('{') 1140 print (' switch (tag)') 1141 print (' {') 1142 1143 def verify_disambiguation_dict (): 1144 """Verify and normalize ``disambiguation``. 1145 1146 ``disambiguation`` is a map of ambiguous OpenType language system 1147 tags to the particular BCP 47 tags they correspond to. This function 1148 checks that all its keys really are ambiguous and that each key's 1149 value is valid for that key. It checks that no ambiguous tag is 1150 missing, except when it can figure out which BCP 47 tag is the best 1151 by itself. 1152 1153 It modifies ``disambiguation`` to remove keys whose values are the 1154 same as those that the fallback would return anyway, and to add 1155 ambiguous keys whose disambiguations it determined automatically. 1156 1157 Raises: 1158 AssertionError: Verification failed. 1159 """ 1160 global bcp_47 1161 global disambiguation 1162 global ot 1163 for ot_tag, bcp_47_tags in ot.to_bcp_47.items (): 1164 if ot_tag == DEFAULT_LANGUAGE_SYSTEM: 1165 primary_tags = [] 1166 else: 1167 primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag) 1168 if len (primary_tags) == 1: 1169 expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag) 1170 if '-' in primary_tags[0]: 1171 disambiguation[ot_tag] = primary_tags[0] 1172 else: 1173 first_tag = sorted ((t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t)), 1174 key=lambda t: (len (t), t))[0] 1175 if primary_tags[0] != first_tag: 1176 disambiguation[ot_tag] = primary_tags[0] 1177 elif len (primary_tags) == 0: 1178 expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag) 1179 else: 1180 original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')] 1181 if len (original_languages) == 1: 1182 macrolanguages = original_languages 1183 else: 1184 macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]'] 1185 if len (macrolanguages) != 1: 1186 macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]') 1187 if len (macrolanguages) != 1: 1188 macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')) 1189 if len (macrolanguages) != 1: 1190 macrolanguages = list (t for t in primary_tags if t.lower () == ISO_639_3_TO_1.get (ot_tag.lower (), ot_tag.lower ())) 1191 if len (macrolanguages) != 1: 1192 macrolanguages = list (t for t in primary_tags if '-' not in t) 1193 if len (macrolanguages) != 1: 1194 expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, sorted (primary_tags))) 1195 expect (disambiguation[ot_tag] in bcp_47_tags, 1196 '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag)) 1197 elif ot_tag not in disambiguation: 1198 disambiguation[ot_tag] = macrolanguages[0] 1199 if '-' not in disambiguation[ot_tag]: 1200 different_bcp_47_tags = sorted ((t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t))), 1201 key=lambda t: (len (t), t)) 1202 if different_bcp_47_tags and disambiguation[ot_tag] == different_bcp_47_tags[0]: 1203 del disambiguation[ot_tag] 1204 for ot_tag in disambiguation.keys (): 1205 expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag) 1206 1207 verify_disambiguation_dict () 1208 for ot_tag, bcp_47_tag in sorted (disambiguation.items ()): 1209 write (' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag])) 1210 print () 1211 write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag)))) 1212 print () 1213 1214 print (' default:') 1215 print (' return HB_LANGUAGE_INVALID;') 1216 print (' }') 1217 print ('}') 1218 1219 print () 1220 print ('#endif /* HB_OT_TAG_TABLE_HH */') 1221 print () 1222 print ('/* == End of generated table == */')