tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gen-tag-table.py (37607B)


      1 #!/usr/bin/env python3
      2 
      3 """Generator of the mapping from OpenType tags to BCP 47 tags and vice
      4 versa.
      5 
      6 It creates a ``const LangTag[]``, matching the tags from the OpenType
      7 languages system tag list to the language subtags of the BCP 47 language
      8 subtag registry, with some manual adjustments. The mappings are
      9 supplemented with macrolanguages' sublanguages and retired codes'
     10 replacements, according to BCP 47 and some manual additions where BCP 47
     11 omits a retired code entirely.
     12 
     13 Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
     14 intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
     15 back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
     16 multiple BCP 47 tags) are listed here, except when the alphabetically
     17 first BCP 47 tag happens to be the chosen disambiguated tag. In that
     18 case, the fallback behavior will choose the right tag anyway.
     19 
     20 usage: ./gen-tag-table.py languagetags language-subtag-registry
     21 
     22 Input files:
     23 * https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
     24 * https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
     25 """
     26 
     27 import collections
     28 import html
     29 from html.parser import HTMLParser
     30 import itertools
     31 import re
     32 import sys
     33 import unicodedata
     34 
     35 if len (sys.argv) != 3:
     36 sys.exit (__doc__)
     37 
     38 def expect (condition, message=None):
     39 if not condition:
     40 	if message is None:
     41 		raise AssertionError
     42 	raise AssertionError (message)
     43 
     44 def write (s):
     45 sys.stdout.flush ()
     46 sys.stdout.buffer.write (s.encode ('utf-8'))
     47 
# The OpenType tag that stands for the default language system: the empty
# string. ``hb_tag`` renders it as ``HB_TAG_NONE``.
DEFAULT_LANGUAGE_SYSTEM = ''

# Map of three-letter ISO 639-3 codes to their two-letter equivalents.
# ``OpenTypeRegistryParser.handle_endtag`` looks codes up here and uses any
# code that is absent from the table unchanged.
# from https://www-01.sil.org/iso639-3/iso-639-3.tab
ISO_639_3_TO_1 = {
    'aar': 'aa',
    'abk': 'ab',
    'afr': 'af',
    'aka': 'ak',
    'amh': 'am',
    'ara': 'ar',
    'arg': 'an',
    'asm': 'as',
    'ava': 'av',
    'ave': 'ae',
    'aym': 'ay',
    'aze': 'az',
    'bak': 'ba',
    'bam': 'bm',
    'bel': 'be',
    'ben': 'bn',
    'bis': 'bi',
    'bod': 'bo',
    'bos': 'bs',
    'bre': 'br',
    'bul': 'bg',
    'cat': 'ca',
    'ces': 'cs',
    'cha': 'ch',
    'che': 'ce',
    'chu': 'cu',
    'chv': 'cv',
    'cor': 'kw',
    'cos': 'co',
    'cre': 'cr',
    'cym': 'cy',
    'dan': 'da',
    'deu': 'de',
    'div': 'dv',
    'dzo': 'dz',
    'ell': 'el',
    'eng': 'en',
    'epo': 'eo',
    'est': 'et',
    'eus': 'eu',
    'ewe': 'ee',
    'fao': 'fo',
    'fas': 'fa',
    'fij': 'fj',
    'fin': 'fi',
    'fra': 'fr',
    'fry': 'fy',
    'ful': 'ff',
    'gla': 'gd',
    'gle': 'ga',
    'glg': 'gl',
    'glv': 'gv',
    'grn': 'gn',
    'guj': 'gu',
    'hat': 'ht',
    'hau': 'ha',
    'hbs': 'sh',
    'heb': 'he',
    'her': 'hz',
    'hin': 'hi',
    'hmo': 'ho',
    'hrv': 'hr',
    'hun': 'hu',
    'hye': 'hy',
    'ibo': 'ig',
    'ido': 'io',
    'iii': 'ii',
    'iku': 'iu',
    'ile': 'ie',
    'ina': 'ia',
    'ind': 'id',
    'ipk': 'ik',
    'isl': 'is',
    'ita': 'it',
    'jav': 'jv',
    'jpn': 'ja',
    'kal': 'kl',
    'kan': 'kn',
    'kas': 'ks',
    'kat': 'ka',
    'kau': 'kr',
    'kaz': 'kk',
    'khm': 'km',
    'kik': 'ki',
    'kin': 'rw',
    'kir': 'ky',
    'kom': 'kv',
    'kon': 'kg',
    'kor': 'ko',
    'kua': 'kj',
    'kur': 'ku',
    'lao': 'lo',
    'lat': 'la',
    'lav': 'lv',
    'lim': 'li',
    'lin': 'ln',
    'lit': 'lt',
    'ltz': 'lb',
    'lub': 'lu',
    'lug': 'lg',
    'mah': 'mh',
    'mal': 'ml',
    'mar': 'mr',
    'mkd': 'mk',
    'mlg': 'mg',
    'mlt': 'mt',
    'mol': 'mo',
    'mon': 'mn',
    'mri': 'mi',
    'msa': 'ms',
    'mya': 'my',
    'nau': 'na',
    'nav': 'nv',
    'nbl': 'nr',
    'nde': 'nd',
    'ndo': 'ng',
    'nep': 'ne',
    'nld': 'nl',
    'nno': 'nn',
    'nob': 'nb',
    'nor': 'no',
    'nya': 'ny',
    'oci': 'oc',
    'oji': 'oj',
    'ori': 'or',
    'orm': 'om',
    'oss': 'os',
    'pan': 'pa',
    'pli': 'pi',
    'pol': 'pl',
    'por': 'pt',
    'pus': 'ps',
    'que': 'qu',
    'roh': 'rm',
    'ron': 'ro',
    'run': 'rn',
    'rus': 'ru',
    'sag': 'sg',
    'san': 'sa',
    'sin': 'si',
    'slk': 'sk',
    'slv': 'sl',
    'sme': 'se',
    'smo': 'sm',
    'sna': 'sn',
    'snd': 'sd',
    'som': 'so',
    'sot': 'st',
    'spa': 'es',
    'sqi': 'sq',
    'srd': 'sc',
    'srp': 'sr',
    'ssw': 'ss',
    'sun': 'su',
    'swa': 'sw',
    'swe': 'sv',
    'tah': 'ty',
    'tam': 'ta',
    'tat': 'tt',
    'tel': 'te',
    'tgk': 'tg',
    'tgl': 'tl',
    'tha': 'th',
    'tir': 'ti',
    'ton': 'to',
    'tsn': 'tn',
    'tso': 'ts',
    'tuk': 'tk',
    'tur': 'tr',
    'twi': 'tw',
    'uig': 'ug',
    'ukr': 'uk',
    'urd': 'ur',
    'uzb': 'uz',
    'ven': 've',
    'vie': 'vi',
    'vol': 'vo',
    'wln': 'wa',
    'wol': 'wo',
    'xho': 'xh',
    'yid': 'yi',
    'yor': 'yo',
    'zha': 'za',
    'zho': 'zh',
    'zul': 'zu',
}
    238 
    239 class LanguageTag (object):
    240 """A BCP 47 language tag.
    241 
    242 Attributes:
    243 	subtags (List[str]): The list of subtags in this tag.
    244 	grandfathered (bool): Whether this tag is grandfathered. If
    245 		``true``, the entire lowercased tag is the ``language``
    246 		and the other subtag fields are empty.
    247 	language (str): The language subtag.
    248 	script (str): The script subtag.
    249 	region (str): The region subtag.
    250 	variant (str): The variant subtag.
    251 
    252 Args:
    253 	tag (str): A BCP 47 language tag.
    254 
    255 """
    256 def __init__ (self, tag):
    257 	global bcp_47
    258 	self.subtags = tag.lower ().split ('-')
    259 	self.grandfathered = tag.lower () in bcp_47.grandfathered
    260 	if self.grandfathered:
    261 		self.language = tag.lower ()
    262 		self.script = ''
    263 		self.region = ''
    264 		self.variant = ''
    265 	else:
    266 		self.language = self.subtags[0]
    267 		self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
    268 		self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
    269 		self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)
    270 
    271 def __str__(self):
    272 	return '-'.join(self.subtags)
    273 
    274 def __repr__ (self):
    275 	return 'LanguageTag(%r)' % str(self)
    276 
    277 @staticmethod
    278 def _find_first (function, sequence):
    279 	try:
    280 		return next (iter (filter (function, sequence)))
    281 	except StopIteration:
    282 		return None
    283 
    284 def is_complex (self):
    285 	"""Return whether this tag is too complex to represent as a
    286 	``LangTag`` in the generated code.
    287 
    288 	Complex tags need to be handled in
    289 	``hb_ot_tags_from_complex_language``.
    290 
    291 	Returns:
    292 		Whether this tag is complex.
    293 	"""
    294 	return not (len (self.subtags) == 1
    295 		or self.grandfathered
    296 		and len (self.subtags[1]) != 3
    297 		and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])
    298 
    299 def get_group (self):
    300 	"""Return the group into which this tag should be categorized in
    301 	``hb_ot_tags_from_complex_language``.
    302 
    303 	The group is the first letter of the tag, or ``'und'`` if this tag
    304 	should not be matched in a ``switch`` statement in the generated
    305 	code.
    306 
    307 	Returns:
    308 		This tag's group.
    309 	"""
    310 	return ('und'
    311 		if (self.language == 'und'
    312 			or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
    313 		else self.language[0])
    314 
    315 class OpenTypeRegistryParser (HTMLParser):
    316 """A parser for the OpenType language system tag registry.
    317 
    318 Attributes:
    319 	header (str): The "last updated" line of the registry.
    320 	names (Mapping[str, str]): A map of language system tags to the
    321 		names they are given in the registry.
    322 	ranks (DefaultDict[str, int]): A map of language system tags to
    323 		numbers. If a single BCP 47 tag corresponds to multiple
    324 		OpenType tags, the tags are ordered in increasing order by
    325 		rank. The rank is based on the number of BCP 47 tags
    326 		associated with a tag, though it may be manually modified.
    327 	to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
    328 		OpenType language system tags to sets of BCP 47 tags.
    329 	from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
    330 		inverted. Its values start as unsorted sets;
    331 		``sort_languages`` converts them to sorted lists.
    332 	from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
    333 		A copy of ``from_bcp_47``. It starts as ``None`` and is
    334 		populated at the beginning of the first call to
    335 		``inherit_from_macrolanguages``.
    336 
    337 """
    338 def __init__ (self):
    339 	HTMLParser.__init__ (self)
    340 	self.header = ''
    341 	self.names = {}
    342 	self.ranks = collections.defaultdict (int)
    343 	self.to_bcp_47 = collections.defaultdict (set)
    344 	self.from_bcp_47 = collections.defaultdict (set)
    345 	self.from_bcp_47_uninherited = None
    346 	# Whether the parser is in a <td> element
    347 	self._td = False
    348 	# Whether the parser ignores the rest of the current <td> element
    349 	self._disengaged = False
    350 	# The text of the <td> elements of the current <tr> element.
    351 	self._current_tr = []
    352 
    353 def handle_starttag (self, tag, attrs):
    354 	if tag == 'a':
    355 		if self._current_tr and not self._disengaged:
    356 			self._current_tr[-1] = ''
    357 			self._disengaged = True
    358 	elif tag == 'br':
    359 		self._disengaged = True
    360 	elif tag == 'meta':
    361 		for attr, value in attrs:
    362 			if attr == 'name' and value == 'updated_at':
    363 				self.header = self.get_starttag_text ()
    364 				break
    365 	elif tag == 'td':
    366 		self._td = True
    367 		self._current_tr.append ('')
    368 	elif tag == 'tr':
    369 		self._disengaged = False
    370 		self._current_tr = []
    371 
    372 def handle_endtag (self, tag):
    373 	if tag == 'td':
    374 		self._td = False
    375 		self._disengaged = False
    376 	elif tag == 'tr' and self._current_tr:
    377 		expect (2 <= len (self._current_tr) <= 3)
    378 		name = self._current_tr[0].strip ()
    379 		tag = self._current_tr[1].strip ("\t\n\v\f\r '")
    380 		rank = 0
    381 		if len (tag) > 4:
    382 			expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
    383 			name += ' (deprecated)'
    384 			tag = tag.split (' ')[0]
    385 			rank = 1
    386 		self.names[tag] = re.sub (' languages$', '', name)
    387 		if not self._current_tr[2]:
    388 			return
    389 		iso_codes = self._current_tr[2].strip ()
    390 		self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
    391 		rank += 2 * len (self.to_bcp_47[tag])
    392 		self.ranks[tag] = rank
    393 
    394 def handle_data (self, data):
    395 	if self._td and not self._disengaged:
    396 		self._current_tr[-1] += data
    397 
    398 def handle_charref (self, name):
    399 	self.handle_data (html.unescape ('&#%s;' % name))
    400 
    401 def handle_entityref (self, name):
    402 	self.handle_data (html.unescape ('&%s;' % name))
    403 
    404 def parse (self, filename):
    405 	"""Parse the OpenType language system tag registry.
    406 
    407 	Args:
    408 		filename (str): The file name of the registry.
    409 	"""
    410 	with open (filename, encoding='utf-8') as f:
    411 		self.feed (f.read ())
    412 	expect (self.header)
    413 	for tag, iso_codes in self.to_bcp_47.items ():
    414 		for iso_code in iso_codes:
    415 			self.from_bcp_47[iso_code].add (tag)
    416 
    417 def add_language (self, bcp_47_tag, ot_tag):
    418 	"""Add a language as if it were in the registry.
    419 
    420 	Args:
    421 		bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
    422 			a language subtag, and if the language subtag is a
    423 			macrolanguage, then new languages are added corresponding
    424 			to the macrolanguages' individual languages with the
    425 			remainder of the tag appended.
    426 		ot_tag (str): An OpenType language system tag.
    427 	"""
    428 	global bcp_47
    429 	self.to_bcp_47[ot_tag].add (bcp_47_tag)
    430 	self.from_bcp_47[bcp_47_tag].add (ot_tag)
    431 	if bcp_47_tag.lower () not in bcp_47.grandfathered:
    432 		try:
    433 			[macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
    434 			if macrolanguage in bcp_47.macrolanguages:
    435 				s = set ()
    436 				for language in bcp_47.macrolanguages[macrolanguage]:
    437 					if language.lower () not in bcp_47.grandfathered:
    438 						s.add ('%s-%s' % (language, suffix))
    439 				bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
    440 		except ValueError:
    441 			pass
    442 
    443 @staticmethod
    444 def _remove_language (tag_1, dict_1, dict_2):
    445 	for tag_2 in dict_1.pop (tag_1):
    446 		dict_2[tag_2].remove (tag_1)
    447 		if not dict_2[tag_2]:
    448 			del dict_2[tag_2]
    449 
    450 def remove_language_ot (self, ot_tag):
    451 	"""Remove an OpenType tag from the registry.
    452 
    453 	Args:
    454 		ot_tag (str): An OpenType tag.
    455 	"""
    456 	self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)
    457 
    458 def remove_language_bcp_47 (self, bcp_47_tag):
    459 	"""Remove a BCP 47 tag from the registry.
    460 
    461 	Args:
    462 		bcp_47_tag (str): A BCP 47 tag.
    463 	"""
    464 	self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)
    465 
    466 def inherit_from_macrolanguages (self):
    467 	"""Copy mappings from macrolanguages to individual languages.
    468 
    469 	If a BCP 47 tag for an individual mapping has no OpenType
    470 	mapping but its macrolanguage does, the mapping is copied to
    471 	the individual language. For example, als (Tosk Albanian) has no
    472 	explicit mapping, so it inherits from sq (Albanian) the mapping
    473 	to SQI.
    474 
    475 	However, if an OpenType tag maps to a BCP 47 macrolanguage and
    476 	some but not all of its individual languages, the mapping is not
    477 	inherited from the macrolanguage to the missing individual
    478 	languages. For example, INUK (Nunavik Inuktitut) is mapped to
    479 	ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
    480 	ikt (Inuinnaqtun, which is an individual language of iu), so
    481 	this method does not add a mapping from ikt to INUK.
    482 
    483 	If a BCP 47 tag for a macrolanguage has no OpenType mapping but
    484 	some of its individual languages do, their mappings are copied
    485 	to the macrolanguage.
    486 	"""
    487 	global bcp_47
    488 	first_time = self.from_bcp_47_uninherited is None
    489 	if first_time:
    490 		self.from_bcp_47_uninherited = dict (self.from_bcp_47)
    491 	for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
    492 		ot_macrolanguages = {
    493 			ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ())
    494 		}
    495 		blocked_ot_macrolanguages = set ()
    496 		if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
    497 			for ot_macrolanguage in ot_macrolanguages:
    498 				round_trip_macrolanguages = {
    499 					l for l in self.to_bcp_47[ot_macrolanguage]
    500 					if 'retired code' not in bcp_47.scopes.get (l, '')
    501 				}
    502 				round_trip_languages = {
    503 					l for l in languages
    504 					if 'retired code' not in bcp_47.scopes.get (l, '')
    505 				}
    506 				intersection = round_trip_macrolanguages & round_trip_languages
    507 				if intersection and intersection != round_trip_languages:
    508 					blocked_ot_macrolanguages.add (ot_macrolanguage)
    509 		if ot_macrolanguages:
    510 			for ot_macrolanguage in ot_macrolanguages:
    511 				if ot_macrolanguage not in blocked_ot_macrolanguages:
    512 					for language in languages:
    513 						self.add_language (language, ot_macrolanguage)
    514 						if not blocked_ot_macrolanguages:
    515 							self.ranks[ot_macrolanguage] += 1
    516 		elif first_time:
    517 			for language in languages:
    518 				if language in self.from_bcp_47_uninherited:
    519 					ot_macrolanguages |= self.from_bcp_47_uninherited[language]
    520 				else:
    521 					ot_macrolanguages.clear ()
    522 				if not ot_macrolanguages:
    523 					break
    524 			for ot_macrolanguage in ot_macrolanguages:
    525 				self.add_language (macrolanguage, ot_macrolanguage)
    526 
    527 def sort_languages (self):
    528 	"""Sort the values of ``from_bcp_47`` in ascending rank order."""
    529 	for language, tags in self.from_bcp_47.items ():
    530 		self.from_bcp_47[language] = sorted (tags,
    531 				key=lambda t: (self.ranks[t] + rank_delta (language, t), t))
    532 
# The shared OpenType registry parser. It is referenced as a global by
# ``LanguageTag.is_complex`` and ``BCP47Parser._add_macrolanguage``.
ot = OpenTypeRegistryParser ()
    534 
    535 class BCP47Parser (object):
    536 """A parser for the BCP 47 subtag registry.
    537 
    538 Attributes:
    539 	header (str): The "File-Date" line of the registry.
    540 	names (Mapping[str, str]): A map of subtags to the names they
    541 		are given in the registry. Each value is a
    542 		``'\\n'``-separated list of names.
    543 	scopes (Mapping[str, str]): A map of language subtags to strings
    544 		suffixed to language names, including suffixes to explain
    545 		language scopes.
    546 	macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
    547 		language subtags to the sets of language subtags which
    548 		inherit from them. See
    549 		``OpenTypeRegistryParser.inherit_from_macrolanguages``.
    550 	prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
    551 		subtags to their prefixes.
    552 	grandfathered (AbstractSet[str]): The set of grandfathered tags,
    553 		normalized to lowercase.
    554 
    555 """
    556 def __init__ (self):
    557 	self.header = ''
    558 	self.names = {}
    559 	self.scopes = {}
    560 	self.macrolanguages = collections.defaultdict (set)
    561 	self.prefixes = collections.defaultdict (set)
    562 	self.grandfathered = set ()
    563 
    564 def parse (self, filename):
    565 	"""Parse the BCP 47 subtag registry.
    566 
    567 	Args:
    568 		filename (str): The file name of the registry.
    569 	"""
    570 	with open (filename, encoding='utf-8') as f:
    571 		subtag_type = None
    572 		subtag = None
    573 		deprecated = False
    574 		has_preferred_value = False
    575 		line_buffer = ''
    576 		for line in itertools.chain (f, ['']):
    577 			line = line.rstrip ()
    578 			if line.startswith (' '):
    579 				line_buffer += line[1:]
    580 				continue
    581 			line, line_buffer = line_buffer, line
    582 			if line.startswith ('Type: '):
    583 				subtag_type = line.split (' ')[1]
    584 				deprecated = False
    585 				has_preferred_value = False
    586 			elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
    587 				subtag = line.split (' ')[1]
    588 				if subtag_type == 'grandfathered':
    589 					self.grandfathered.add (subtag.lower ())
    590 			elif line.startswith ('Description: '):
    591 				description = line.split (' ', 1)[1].replace (' (individual language)', '')
    592 				description = re.sub (r' (\(family\)|\((individual |macro)language\)|languages)$', '',
    593 						description)
    594 				if subtag in self.names:
    595 					self.names[subtag] += '\n' + description
    596 				else:
    597 					self.names[subtag] = description
    598 			elif subtag_type == 'language' or subtag_type == 'grandfathered':
    599 				if line.startswith ('Scope: '):
    600 					scope = line.split (' ')[1]
    601 					if scope == 'macrolanguage':
    602 						scope = ' [macrolanguage]'
    603 					elif scope == 'collection':
    604 						scope = ' [collection]'
    605 					else:
    606 						continue
    607 					self.scopes[subtag] = scope
    608 				elif line.startswith ('Deprecated: '):
    609 					self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
    610 					deprecated = True
    611 				elif deprecated and line.startswith ('Comments: see '):
    612 					# If a subtag is split into multiple replacement subtags,
    613 					# it essentially represents a macrolanguage.
    614 					for language in line.replace (',', '').split (' ')[2:]:
    615 						self._add_macrolanguage (subtag, language)
    616 				elif line.startswith ('Preferred-Value: '):
    617 					# If a subtag is deprecated in favor of a single replacement subtag,
    618 					# it is either a dialect or synonym of the preferred subtag. Either
    619 					# way, it is close enough to the truth to consider the replacement
    620 					# the macrolanguage of the deprecated language.
    621 					has_preferred_value = True
    622 					macrolanguage = line.split (' ')[1]
    623 					self._add_macrolanguage (macrolanguage, subtag)
    624 				elif not has_preferred_value and line.startswith ('Macrolanguage: '):
    625 					self._add_macrolanguage (line.split (' ')[1], subtag)
    626 			elif subtag_type == 'variant':
    627 				if line.startswith ('Deprecated: '):
    628 					self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
    629 				elif line.startswith ('Prefix: '):
    630 					self.prefixes[subtag].add (line.split (' ')[1])
    631 			elif line.startswith ('File-Date: '):
    632 				self.header = line
    633 	expect (self.header)
    634 
    635 def _add_macrolanguage (self, macrolanguage, language):
    636 	global ot
    637 	if language not in ot.from_bcp_47:
    638 		for l in self.macrolanguages.get (language, set ()):
    639 			self._add_macrolanguage (macrolanguage, l)
    640 	if macrolanguage not in ot.from_bcp_47:
    641 		for ls in list (self.macrolanguages.values ()):
    642 			if macrolanguage in ls:
    643 				ls.add (language)
    644 				return
    645 	self.macrolanguages[macrolanguage].add (language)
    646 
    647 def remove_extra_macrolanguages (self):
    648 	"""Make every language have at most one macrolanguage."""
    649 	inverted = collections.defaultdict (list)
    650 	for macrolanguage, languages in self.macrolanguages.items ():
    651 		for language in languages:
    652 			inverted[language].append (macrolanguage)
    653 	for language, macrolanguages in inverted.items ():
    654 		if len (macrolanguages) > 1:
    655 			macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
    656 			biggest_macrolanguage = macrolanguages.pop ()
    657 			for macrolanguage in macrolanguages:
    658 				self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
    659 
    660 def _get_name_piece (self, subtag):
    661 	"""Return the first name of a subtag plus its scope suffix.
    662 
    663 	Args:
    664 		subtag (str): A BCP 47 subtag.
    665 
    666 	Returns:
    667 		The name form of ``subtag``.
    668 	"""
    669 	return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')
    670 
    671 def get_name (self, lt):
    672 	"""Return the names of the subtags in a language tag.
    673 
    674 	Args:
    675 		lt (LanguageTag): A BCP 47 language tag.
    676 
    677 	Returns:
    678 		The name form of ``lt``.
    679 	"""
    680 	name = self._get_name_piece (lt.language)
    681 	if lt.script:
    682 		name += '; ' + self._get_name_piece (lt.script.title ())
    683 	if lt.region:
    684 		name += '; ' + self._get_name_piece (lt.region.upper ())
    685 	if lt.variant:
    686 		name += '; ' + self._get_name_piece (lt.variant)
    687 	return name
    688 
# The shared BCP 47 registry parser. It is referenced as a global by
# ``LanguageTag`` and ``OpenTypeRegistryParser``.
bcp_47 = BCP47Parser ()

# Parse the two input files named on the command line: the OpenType
# language system tag registry first, then the BCP 47 subtag registry.
ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Everything below, up to ``rank_delta``, adjusts the parsed data by hand.
# The statements are order-dependent: ``remove_language_ot`` drops every
# mapping of an OpenType tag before replacement mappings are added.

ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

# Rank BAL just after KAR.
ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

# Replace the registry's mappings for PGR with the single tag el-polyton.
ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

# Register flm by hand as a retired code whose only sublanguage is cfm.
bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

# Rank FNE just after TNE.
ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

ot.add_language ('ga-Latg', 'IRT')

ot.add_language ('hy-arevmda', 'HYE')

# Replace the registry's mappings for KGE with the single tag und-Geok.
ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

# KHN is given the same BCP 47 tag as KHT, a derived name, and the next rank.
ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.ranks['KHN'] = ot.ranks['KHT'] + 1

# Rank LCR just after MCR.
ot.ranks['LCR'] = ot.ranks['MCR'] + 1

ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

# Register mhv by hand as a retired code.
bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('mnw-TH', 'MONT')

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

# Replace the registry's mappings for QUZ with qu, then map the listed
# three-letter qu*/qv*/qw*/qx* codes to QWH, QVI or QUH explicitly.
ot.remove_language_ot ('QUZ')
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qul', 'QUH')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

# Make mo inherit from ro-MD.
bcp_47.macrolanguages['ro-MD'].add ('mo')

# Replace the registry's mappings for SYRE, SYRJ and SYRN with und-Syre,
# und-Syrj and und-Syrn respectively.
ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

# Register xst by hand as a retired code with sublanguages stv and wle.
bcp_47.names['xst'] = "Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

# Drop the registry's mappings for ZHH, ZHP, ZHT and ZHTM, stop lzh and
# yue from inheriting from zh, and add explicit mappings instead. ZHP is
# removed without a replacement.
ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
ot.remove_language_ot ('ZHTM')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-MO', 'ZHTM')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
ot.add_language ('zh-MO', 'ZHTM')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')
    809 
    810 def rank_delta (bcp_47, ot):
    811 """Return a delta to apply to a BCP 47 tag's rank.
    812 
    813 Most OpenType tags have a constant rank, but a few have ranks that
    814 depend on the BCP 47 tag.
    815 
    816 Args:
    817 	bcp_47 (str): A BCP 47 tag.
    818 	ot (str): An OpenType tag to.
    819 
    820 Returns:
    821 	A number to add to ``ot``'s rank when sorting ``bcp_47``'s
    822 	OpenType equivalents.
    823 """
    824 if bcp_47 == 'ak' and ot == 'AKA':
    825 	return -1
    826 if bcp_47 == 'tw' and ot == 'TWI':
    827 	return -1
    828 return 0
    829 
# Map of OpenType tags to a single BCP 47 tag each.
# NOTE(review): presumably the tag chosen when an OpenType tag corresponds
# to several BCP 47 tags, as the module docstring describes for
# ``hb_ot_ambiguous_tag_to_language``; the code that reads this table is
# not visible in this part of the file, so confirm there.
disambiguation = {
    'ALT': 'alt',
    'ARK': 'rki',
    'ATH': 'ath',
    'BHI': 'bhb',
    'BLN': 'bjt',
    'BTI': 'beb',
    'CCHN': 'cco',
    'CMR': 'swb',
    'CPP': 'crp',
    'CRR': 'crx',
    'DUJ': 'dwu',
    'ECR': 'crj',
    'HAL': 'cfm',
    'HND': 'hnd',
    'HYE': 'hyw',
    'KIS': 'kqs',
    'KUI': 'uki',
    'LRC': 'bqi',
    'NDB': 'nd',
    'NIS': 'njz',
    'PLG': 'pce',
    'PRO': 'pro',
    'QIN': 'bgr',
    'QUH': 'quh',
    'QVI': 'qvi',
    'QWH': 'qwh',
    'SIG': 'stv',
    'SRB': 'sr',
    'SXT': 'xnj',
    'ZHH': 'zh-HK',
    'ZHS': 'zh-Hans',
    'ZHT': 'zh-Hant',
    'ZHTM': 'zh-MO',
}
    865 
# Propagate mappings between macrolanguages and their individual languages.
# The first call also snapshots ``ot.from_bcp_47``; the second call runs
# after the macrolanguage sets have been consolidated.
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
# Register the default language system under the name '*/' and rank it
# after every other OpenType tag.
ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1
# For every OpenType tag made of exactly three uppercase letters whose
# lowercase form is a named BCP 47 subtag without any OpenType mapping,
# map that subtag to the default language system and give it an empty set
# of sublanguages.
for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names):
    possible_bcp_47_tag = tricky_ot_tag.lower ()
    if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
        ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
        bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
# Turn each set in ``ot.from_bcp_47`` into a rank-ordered list.
ot.sort_languages ()
    877 
    878 print ('/* == Start of generated table == */')
    879 print ('/*')
    880 print (' * The following table is generated by running:')
    881 print (' *')
    882 print (' *   %s languagetags language-subtag-registry' % sys.argv[0])
    883 print (' *')
    884 print (' * on files with these headers:')
    885 print (' *')
    886 print (' * %s' % ot.header.strip ())
    887 print (' * %s' % bcp_47.header)
    888 print (' */')
    889 print ()
    890 print ('#ifndef HB_OT_TAG_TABLE_HH')
    891 print ('#define HB_OT_TAG_TABLE_HH')
    892 print ()
    893 
    894 def hb_tag (tag):
    895 """Convert a tag to ``HB_TAG`` form.
    896 
    897 Args:
    898 	tag (str): An OpenType tag.
    899 
    900 Returns:
    901 	A snippet of C++ representing ``tag``.
    902 """
    903 if tag == DEFAULT_LANGUAGE_SYSTEM:
    904 	return 'HB_TAG_NONE\t       '
    905 return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
    906 
    907 def get_variant_set (name):
    908 """Return a set of variant language names from a name.
    909 
    910 Args:
    911 	name (str): A list of language names from the BCP 47 registry,
    912 		joined on ``'\\n'``.
    913 
    914 Returns:
    915 	A set of normalized language names.
    916 """
    917 return set (unicodedata.normalize (
    918 			'NFD',
    919 			n.replace ('\u02BC', "'").replace ('\u2019', "'"),
    920 		)
    921 		.encode ('ASCII', 'ignore')
    922 		.strip ()
    923 		for n in re.split ('[\n(),]', name) if n)
    924 
    925 def language_name_intersection (a, b):
    926 """Return the names in common between two language names.
    927 
    928 Args:
    929 	a (str): A list of language names from the BCP 47 registry,
    930 		joined on ``'\\n'``.
    931 	b (str): A list of language names from the BCP 47 registry,
    932 		joined on ``'\\n'``.
    933 
    934 Returns:
    935 	The normalized language names shared by ``a`` and ``b``.
    936 """
    937 return get_variant_set (a).intersection (get_variant_set (b))
    938 
    939 def get_matching_language_name (intersection, candidates):
    940 return next (iter (c for c in candidates if not intersection.isdisjoint (get_variant_set (c))))
    941 
    942 def same_tag (bcp_47_tag, ot_tags):
    943 return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower ()
    944 
    945 for language_len in (2, 3):
    946 if language_len == 3:
    947 	print ('#ifndef HB_NO_LANGUAGE_LONG')
    948 print ('static const LangTag ot_languages%d[] = {' % language_len)
    949 for language, tags in sorted (ot.from_bcp_47.items ()):
    950 	if language == '' or '-' in language:
    951 		continue
    952 	if len(language) != language_len: continue
    953 	commented_out = same_tag (language, tags)
    954 	for i, tag in enumerate (tags, start=1):
    955 		print ('%s{%s,\t%s},' % ('/*' if commented_out else '  ', hb_tag (language), hb_tag (tag)), end='')
    956 		if commented_out:
    957 			print ('*/', end='')
    958 		print ('\t/* ', end='')
    959 		bcp_47_name = bcp_47.names.get (language, '')
    960 		bcp_47_name_candidates = bcp_47_name.split ('\n')
    961 		ot_name = ot.names[tag]
    962 		scope = bcp_47.scopes.get (language, '')
    963 		if tag == DEFAULT_LANGUAGE_SYSTEM:
    964 			write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
    965 		else:
    966 			intersection = language_name_intersection (bcp_47_name, ot_name)
    967 			if not intersection:
    968 				write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
    969 			else:
    970 				name = get_matching_language_name (intersection, bcp_47_name_candidates)
    971 				bcp_47.names[language] = name
    972 				write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
    973 		print (' */')
    974 print ('};')
    975 if language_len == 3:
    976 	print ('#endif')
    977 print ()
    978 
    979 print ('/**')
    980 print (' * hb_ot_tags_from_complex_language:')
    981 print (' * @lang_str: a BCP 47 language tag to convert.')
    982 print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
    983 print (' * conversion.')
    984 print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
    985 print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
    986 print (' * @tags: array of size at least @language_count to store the language tag')
    987 print (' * results')
    988 print (' *')
    989 print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
    990 print (' *')
    991 print (' * Return value: Whether any language systems were retrieved.')
    992 print (' **/')
    993 print ('static inline bool')
    994 print ('hb_ot_tags_from_complex_language (const char   *lang_str,')
    995 print ('\t\t\t\t  const char   *limit,')
    996 print ('\t\t\t\t  unsigned int *count /* IN/OUT */,')
    997 print ('\t\t\t\t  hb_tag_t     *tags /* OUT */)')
    998 print ('{')
    999 
   1000 def print_subtag_matches (subtag, string, new_line):
   1001 if subtag:
   1002 	if new_line:
   1003 		print ()
   1004 		print ('\t&& ', end='')
   1005 	print ('subtag_matches (%s, limit, "-%s", %i)' % (string, subtag, 1 + len (subtag)), end='')
   1006 
   1007 complex_tags = collections.defaultdict (list)
   1008 for initial, group in itertools.groupby ((lt_tags for lt_tags in [
   1009 		(LanguageTag (language), tags)
   1010 		for language, tags in sorted (ot.from_bcp_47.items (),
   1011 			key=lambda i: (-len (i[0]), i[0]))
   1012 	] if lt_tags[0].is_complex ()),
   1013 	key=lambda lt_tags: lt_tags[0].get_group ()):
   1014 complex_tags[initial] += group
   1015 
   1016 # Calculate the min length of the subtags outside the switch
   1017 min_subtag_len = 100
   1018 for initial, items in sorted (complex_tags.items ()):
   1019 if initial != 'und':
   1020 	continue
   1021 for lt, tags in items:
   1022 	if not tags:
   1023 		continue
   1024 	subtag_len = 0
   1025 	subtag_len += 1 + len (lt.script) if lt.script is not None else 0
   1026 	subtag_len += 1 + len (lt.region) if lt.region is not None else 0
   1027 	subtag_len += 1 + len (lt.variant) if lt.variant is not None else 0
   1028 	min_subtag_len = min(subtag_len, min_subtag_len)
   1029 
   1030 print ('  if (limit - lang_str >= %d)' % (min_subtag_len + 2))
   1031 print ('  {')
   1032 print ("    const char *p = strchr (lang_str, '-');")
   1033 print ("    if (!p || p >= limit || limit - p < %i) goto out;" % min_subtag_len)
   1034 for initial, items in sorted (complex_tags.items ()):
   1035 if initial != 'und':
   1036 	continue
   1037 for lt, tags in items:
   1038 	if not tags:
   1039 		continue
   1040 	if lt.variant in bcp_47.prefixes:
   1041 		expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
   1042 				'%s is not a valid prefix of %s' % (lt.language, lt.variant))
   1043 	print ('    if (', end='')
   1044 	print_subtag_matches (lt.script, 'p', False)
   1045 	print_subtag_matches (lt.region, 'p', False)
   1046 	print_subtag_matches (lt.variant, 'p', False)
   1047 	print (')')
   1048 	print ('    {')
   1049 	write ('      /* %s */' % bcp_47.get_name (lt))
   1050 	print ()
   1051 	if len (tags) == 1:
   1052 		write ('      tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
   1053 		print ()
   1054 		print ('      *count = 1;')
   1055 	else:
   1056 		print ('    hb_tag_t possible_tags[] = {')
   1057 		for tag in tags:
   1058 			write ('      %s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
   1059 			print ()
   1060 		print ('      };')
   1061 		print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
   1062 		print ('\ttags[i] = possible_tags[i];')
   1063 		print ('      *count = i;')
   1064 	print ('      return true;')
   1065 	print ('    }')
   1066 print ('  }')
   1067 print ('out:')
   1068 
   1069 print ('  switch (lang_str[0])')
   1070 print ('  {')
   1071 for initial, items in sorted (complex_tags.items ()):
   1072 if initial == 'und':
   1073 	continue
   1074 print ("  case '%s':" % initial)
   1075 for lt, tags in items:
   1076 	if not tags:
   1077 		continue
   1078 	print ('    if (', end='')
   1079 	script = lt.script
   1080 	region = lt.region
   1081 	if lt.grandfathered:
   1082 		print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
   1083 	else:
   1084 		string_literal = lt.language[1:] + '-'
   1085 		if script:
   1086 			string_literal += script
   1087 			script = None
   1088 			if region:
   1089 				string_literal += '-' + region
   1090 				region = None
   1091 		if string_literal[-1] == '-':
   1092 			print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
   1093 		else:
   1094 			print ('lang_matches (&lang_str[1], limit, "%s", %i)' % (string_literal, len (string_literal)), end='')
   1095 	print_subtag_matches (script, 'lang_str', True)
   1096 	print_subtag_matches (region, 'lang_str', True)
   1097 	print_subtag_matches (lt.variant, 'lang_str', True)
   1098 	print (')')
   1099 	print ('    {')
   1100 	write ('      /* %s */' % bcp_47.get_name (lt))
   1101 	print ()
   1102 	if len (tags) == 1:
   1103 		write ('      tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
   1104 		print ()
   1105 		print ('      *count = 1;')
   1106 	else:
   1107 		print ('      unsigned int i;')
   1108 		print ('      hb_tag_t possible_tags[] = {')
   1109 		for tag in tags:
   1110 			write ('\t%s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
   1111 			print ()
   1112 		print ('      };')
   1113 		print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
   1114 		print ('\ttags[i] = possible_tags[i];')
   1115 		print ('      *count = i;')
   1116 	print ('      return true;')
   1117 	print ('    }')
   1118 print ('    break;')
   1119 
   1120 print ('  }')
   1121 print ('  return false;')
   1122 print ('}')
   1123 print ()
   1124 print ('/**')
   1125 print (' * hb_ot_ambiguous_tag_to_language')
   1126 print (' * @tag: A language tag.')
   1127 print (' *')
   1128 print (' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to')
   1129 print (' * many language tags) and the best tag is not the first (sorted alphabetically,')
   1130 print (' * with two-letter tags having priority over all three-letter tags), or if the')
   1131 print (' * best tag consists of multiple subtags, or if the best tag does not appear in')
   1132 print (' * #ot_languages2 or #ot_languages3.')
   1133 print (' *')
   1134 print (' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,')
   1135 print (' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.')
   1136 print (' **/')
   1137 print ('static inline hb_language_t')
   1138 print ('hb_ot_ambiguous_tag_to_language (hb_tag_t tag)')
   1139 print ('{')
   1140 print ('  switch (tag)')
   1141 print ('  {')
   1142 
def verify_disambiguation_dict ():
	"""Verify and normalize ``disambiguation``.

	``disambiguation`` is a map of ambiguous OpenType language system
	tags to the particular BCP 47 tags they correspond to. This function
	checks that all its keys really are ambiguous and that each key's
	value is valid for that key. It checks that no ambiguous tag is
	missing, except when it can figure out which BCP 47 tag is the best
	by itself.

	It modifies ``disambiguation`` to remove keys whose values are the
	same as those that the fallback would return anyway, and to add
	ambiguous keys whose disambiguations it determined automatically.

	Raises:
		AssertionError: Verification failed.
	"""
	global bcp_47
	global disambiguation
	global ot
	for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
		# A "primary" tag is a non-grandfathered BCP 47 tag whose first
		# (index 0) OpenType tag is this one. The default language
		# system never has any.
		if ot_tag == DEFAULT_LANGUAGE_SYSTEM:
			primary_tags = []
		else:
			primary_tags = list (t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot.from_bcp_47.get (t)[0] == ot_tag)
		if len (primary_tags) == 1:
			# Exactly one primary tag: a manual entry would be redundant.
			expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
			if '-' in primary_tags[0]:
				# Multi-subtag results are always recorded explicitly.
				disambiguation[ot_tag] = primary_tags[0]
			else:
				# Record the primary tag only if it is not the first
				# candidate when sorted by length, then alphabetically.
				first_tag = sorted ((t for t in bcp_47_tags if t not in bcp_47.grandfathered and ot_tag in ot.from_bcp_47.get (t)),
						key=lambda t: (len (t), t))[0]
				if primary_tags[0] != first_tag:
					disambiguation[ot_tag] = primary_tags[0]
		elif len (primary_tags) == 0:
			expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
		else:
			# Several primary tags: try successive filters until one of
			# them leaves exactly one candidate. Despite its name,
			# ``macrolanguages`` holds whatever the latest filter kept.
			original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')]
			if len (original_languages) == 1:
				macrolanguages = original_languages
			else:
				macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
			if len (macrolanguages) != 1:
				macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [collection]')
			if len (macrolanguages) != 1:
				macrolanguages = list (t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, ''))
			if len (macrolanguages) != 1:
				# Prefer the tag equal to the lowercased OpenType tag, after
				# passing it through ``ISO_639_3_TO_1`` when it has an
				# entry there.
				macrolanguages = list (t for t in primary_tags if t.lower () == ISO_639_3_TO_1.get (ot_tag.lower (), ot_tag.lower ()))
			if len (macrolanguages) != 1:
				macrolanguages = list (t for t in primary_tags if '-' not in t)
			if len (macrolanguages) != 1:
				# No filter settled it: a valid manual entry is required.
				expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, sorted (primary_tags)))
				expect (disambiguation[ot_tag] in bcp_47_tags,
						'%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
			elif ot_tag not in disambiguation:
				disambiguation[ot_tag] = macrolanguages[0]
			if '-' not in disambiguation[ot_tag]:
				# Drop the entry when it equals the first candidate (by
				# length, then alphabetically) among the tags that are
				# not a plain ``same_tag`` match.
				different_bcp_47_tags = sorted ((t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t))),
						key=lambda t: (len (t), t))
				if different_bcp_47_tags and disambiguation[ot_tag] == different_bcp_47_tags[0]:
					del disambiguation[ot_tag]
	# Every remaining key must be a known OpenType tag.
	for ot_tag in disambiguation.keys ():
		expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
   1206 
   1207 verify_disambiguation_dict ()
   1208 for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
   1209 write ('  case %s:  /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
   1210 print ()
   1211 write ('    return hb_language_from_string (\"%s\", -1);  /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
   1212 print ()
   1213 
   1214 print ('  default:')
   1215 print ('    return HB_LANGUAGE_INVALID;')
   1216 print ('  }')
   1217 print ('}')
   1218 
   1219 print ()
   1220 print ('#endif /* HB_OT_TAG_TABLE_HH */')
   1221 print ()
   1222 print ('/* == End of generated table == */')