tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gen-vowel-constraints.py (7634B)


      1 #!/usr/bin/env python3
      2 
      3 """Generator of the function to prohibit certain vowel sequences.
      4 
      5 It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
      6 circles into sequences prohibited by the USE script development spec.
      7 This function should be used as the ``preprocess_text`` of an
      8 ``hb_ot_shaper_t``.
      9 
     10 usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt
     11 
     12 Input file:
     13 * https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
     14 """
     15 
     16 import collections
     17 def write (s):
     18 sys.stdout.flush ()
     19 sys.stdout.buffer.write (s.encode ('utf-8'))
     20 import sys
     21 
     22 if len (sys.argv) != 3:
     23 sys.exit (__doc__)
     24 
     25 with open (sys.argv[2], encoding='utf-8') as f:
     26 scripts_header = [f.readline () for i in range (2)]
     27 scripts = {}
     28 script_order = {}
     29 for line in f:
     30 	j = line.find ('#')
     31 	if j >= 0:
     32 		line = line[:j]
     33 	fields = [x.strip () for x in line.split (';')]
     34 	if len (fields) == 1:
     35 		continue
     36 	uu = fields[0].split ('..')
     37 	start = int (uu[0], 16)
     38 	if len (uu) == 1:
     39 		end = start
     40 	else:
     41 		end = int (uu[1], 16)
     42 	script = fields[1]
     43 	for u in range (start, end + 1):
     44 		scripts[u] = script
     45 	if script not in script_order:
     46 		script_order[script] = start
     47 
     48 class ConstraintSet (object):
     49 """A set of prohibited code point sequences.
     50 
     51 Args:
     52 	constraint (List[int]): A prohibited code point sequence.
     53 
     54 """
     55 def __init__ (self, constraint):
     56 	# Either a list or a dictionary. As a list of code points, it
     57 	# represents a prohibited code point sequence. As a dictionary,
     58 	# it represents a set of prohibited sequences, where each item
     59 	# represents the set of prohibited sequences starting with the
     60 	# key (a code point) concatenated with any of the values
     61 	# (ConstraintSets).
     62 	self._c = constraint
     63 
     64 def add (self, constraint):
     65 	"""Add a constraint to this set."""
     66 	if not constraint:
     67 		return
     68 	first = constraint[0]
     69 	rest = constraint[1:]
     70 	if isinstance (self._c, list):
     71 		if constraint == self._c[:len (constraint)]:
     72 			self._c = constraint
     73 		elif self._c != constraint[:len (self._c)]:
     74 			self._c = {self._c[0]: ConstraintSet (self._c[1:])}
     75 	if isinstance (self._c, dict):
     76 		if first in self._c:
     77 			self._c[first].add (rest)
     78 		else:
     79 			self._c[first] = ConstraintSet (rest)
     80 
     81 @staticmethod
     82 def _indent (depth):
     83 	return ('  ' * depth).replace ('        ', '\t')
     84 
     85 def __str__ (self, index=0, depth=4):
     86 	s = []
     87 	indent = self._indent (depth)
     88 	if isinstance (self._c, list):
     89 		if len (self._c) == 0:
     90 			assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
     91 			s.append ('{}matched = true;\n'.format (indent))
     92 		elif len (self._c) == 1:
     93 			assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
     94 			s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, next (iter (self._c)), index or ''))
     95 		else:
     96 			s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or ''))
     97 			if index:
     98 				s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), index + 1))
     99 			for i, cp in enumerate (self._c[1:], start=1):
    100 				s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
    101 					self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
    102 			s.append ('{}{{\n'.format (indent))
    103 			for i in range (index):
    104 				s.append ('{}(void) buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
    105 			s.append ('{}matched = true;\n'.format (self._indent (depth + 1)))
    106 			s.append ('{}}}\n'.format (indent))
    107 	else:
    108 		s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
    109 		s.append ('{}{{\n'.format (indent))
    110 		cases = collections.defaultdict (set)
    111 		for first, rest in sorted (self._c.items ()):
    112 			cases[rest.__str__ (index + 1, depth + 2)].add (first)
    113 		for body, labels in sorted (cases.items (), key=lambda b_ls: sorted (b_ls[1])[0]):
    114 			for i, cp in enumerate (sorted (labels)):
    115 				if i % 4 == 0:
    116 					s.append (self._indent (depth + 1))
    117 				else:
    118 					s.append (' ')
    119 				s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
    120 			if len (labels) % 4 != 0:
    121 				s.append ('\n')
    122 			s.append (body)
    123 			s.append ('{}break;\n'.format (self._indent (depth + 2)))
    124 		s.append ('{}}}\n'.format (indent))
    125 	return ''.join (s)
    126 
    127 constraints = {}
    128 with open (sys.argv[1], encoding='utf-8') as f:
    129 constraints_header = []
    130 while True:
    131 	line = f.readline ().strip ()
    132 	if line == '#':
    133 		break
    134 	constraints_header.append(line)
    135 for line in f:
    136 	j = line.find ('#')
    137 	if j >= 0:
    138 		line = line[:j]
    139 	constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
    140 	if not constraint: continue
    141 	assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
    142 	script = scripts[constraint[0]]
    143 	if script in constraints:
    144 		constraints[script].add (constraint)
    145 	else:
    146 		constraints[script] = ConstraintSet (constraint)
    147 	assert constraints, 'No constraints found'
    148 
    149 print ('/* == Start of generated functions == */')
    150 print ('/*')
    151 print (' * The following functions are generated by running:')
    152 print (' *')
    153 print (' *   %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % sys.argv[0])
    154 print (' *')
    155 print (' * on files with these headers:')
    156 print (' *')
    157 for line in constraints_header:
    158 print (' * %s' % line.strip ())
    159 print (' *')
    160 for line in scripts_header:
    161 print (' * %s' % line.strip ())
    162 print (' */')
    163 
    164 print ()
    165 print ('#include "hb.hh"')
    166 print ()
    167 print ('#ifndef HB_NO_OT_SHAPE')
    168 print ()
    169 print ('#include "hb-ot-shaper-vowel-constraints.hh"')
    170 print ()
    171 print ('static void')
    172 print ('_output_dotted_circle (hb_buffer_t *buffer)')
    173 print ('{')
    174 print ('  (void) buffer->output_glyph (0x25CCu);')
    175 print ('  _hb_glyph_info_clear_continuation (&buffer->prev());')
    176 print ('}')
    177 print ()
    178 print ('static void')
    179 print ('_output_with_dotted_circle (hb_buffer_t *buffer)')
    180 print ('{')
    181 print ('  _output_dotted_circle (buffer);')
    182 print ('  (void) buffer->next_glyph ();')
    183 print ('}')
    184 print ()
    185 
    186 print ('void')
    187 print ('_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,')
    188 print ('\t\t\t\t       hb_buffer_t              *buffer,')
    189 print ('\t\t\t\t       hb_font_t                *font HB_UNUSED)')
    190 print ('{')
    191 print ('#ifdef HB_NO_OT_SHAPER_VOWEL_CONSTRAINTS')
    192 print ('  return;')
    193 print ('#endif')
    194 print ('  if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)')
    195 print ('    return;')
    196 print ()
    197 print ('  /* UGLY UGLY UGLY business of adding dotted-circle in the middle of')
    198 print ('   * vowel-sequences that look like another vowel.  Data for each script')
    199 print ('   * collected from the USE script development spec.')
    200 print ('   *')
    201 print ('   * https://github.com/harfbuzz/harfbuzz/issues/1019')
    202 print ('   */')
    203 print ('  buffer->clear_output ();')
    204 print ('  unsigned int count = buffer->len;')
    205 print ('  switch ((unsigned) buffer->props.script)')
    206 print ('  {')
    207 
    208 for script, constraints in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
    209 print ('    case HB_SCRIPT_{}:'.format (script.upper ()))
    210 print ('      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
    211 print ('      {')
    212 print ('\tbool matched = false;')
    213 write (str (constraints))
    214 print ('\t(void) buffer->next_glyph ();')
    215 print ('\tif (matched) _output_with_dotted_circle (buffer);')
    216 print ('      }')
    217 print ('      break;')
    218 print ()
    219 
    220 print ('    default:')
    221 print ('      break;')
    222 print ('  }')
    223 print ('  buffer->sync ();')
    224 print ('}')
    225 
    226 print ()
    227 print ()
    228 print ('#endif')
    229 print ('/* == End of generated functions == */')