updateOperatorDictionary.pl (14702B)
#!/usr/bin/perl
# -*- Mode: Perl; tab-width: 2; indent-tabs-mode: nil; -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Maintenance tool for the operator dictionary stored in mathfont.properties.
#
# Modes (first command-line argument):
#   download [url]   fetch unicode.xml and transform it into dictionary.xml
#                    with operatorDictionary.xsl
#   compare [file]   compare mathfont.properties with the transformed W3C
#                    dictionary; writes differences.txt and new_dictionary.txt
#   check            validate mathfont.properties; writes syntax_errors.txt
#   clean            remove every file generated by this script

use strict;
use warnings;

use XML::LibXSLT;
use XML::LibXML;
use LWP::Simple;

# output files
my $FILE_UNICODE = "unicode.xml";
my $FILE_DICTIONARY = "dictionary.xml";
my $FILE_DIFFERENCES = "differences.txt";
my $FILE_NEW_DICTIONARY = "new_dictionary.txt";
my $FILE_SYNTAX_ERRORS = "syntax_errors.txt";

# our dictionary (property file)
my $MOZ_DICTIONARY = "mathfont.properties";

# dictionary provided by the W3C in "XML Entity Definitions for Characters"
my $WG_DICTIONARY_URL = "https://raw.githubusercontent.com/w3c/xml-entities/gh-pages/unicode.xml";

# XSL stylesheet to extract relevant data from the dictionary
my $DICTIONARY_XSL = "operatorDictionary.xsl";

# dictionary provided by the W3C transformed with operatorDictionary.xsl
my $WG_DICTIONARY = $FILE_DICTIONARY;

# Maximum number of arguments accepted after the mode name.
my %MAX_EXTRA_ARGS = (
  download => 1,
  compare  => 1,
  check    => 0,
  clean    => 0,
);

my ($mode, @extra_args) = @ARGV;
if (!(defined($mode) &&
      exists($MAX_EXTRA_ARGS{$mode}) &&
      @extra_args <= $MAX_EXTRA_ARGS{$mode})) {
  usage();
}

if ($mode eq "download") {
  if (@extra_args) {
    $WG_DICTIONARY_URL = $extra_args[0];
  }
  print "Downloading $WG_DICTIONARY_URL...\n";
  # getstore returns the HTTP status code; do not go on to convert a stale or
  # missing file when the download failed.
  my $status = getstore($WG_DICTIONARY_URL, $FILE_UNICODE);
  is_success($status) ||
    die ("Couldn't download $WG_DICTIONARY_URL (HTTP status $status)!\n");

  print "Converting $FILE_UNICODE into $FILE_DICTIONARY...\n";
  my $xslt = XML::LibXSLT->new();
  my $source = XML::LibXML->load_xml(location => $FILE_UNICODE);
  my $style_doc = XML::LibXML->load_xml(location => $DICTIONARY_XSL,
                                        no_cdata => 1);
  my $stylesheet = $xslt->parse_stylesheet($style_doc);
  my $results = $stylesheet->transform($source);
  # output_as_bytes yields already-encoded bytes, hence the raw layer.
  open(my $file, ">:raw", $FILE_DICTIONARY) ||
    die ("Couldn't open $FILE_DICTIONARY: $!");
  print {$file} $stylesheet->output_as_bytes($results);
  close($file) || die ("Couldn't write $FILE_DICTIONARY: $!");
  exit 0;
}

if ($mode eq "clean") {
  unlink($FILE_UNICODE,
         $FILE_DICTIONARY,
         $FILE_DIFFERENCES,
         $FILE_NEW_DICTIONARY,
         $FILE_SYNTAX_ERRORS);
  exit 0;
}

if ($mode eq "compare" && @extra_args) {
  $WG_DICTIONARY = $extra_args[0];
}

################################################################################
# structure of the dictionary used by this script:
# - key: same as in mathfont.properties
# - table:
#    index | value
#      0   | description
#      1   | lspace
#      2   | rspace
#      4   | largeop
#      5   | movablelimits
#      6   | stretchy
#      7   | separator
#      8   | accent
#      9   | fence
#     10   | symmetric
#     13   | direction

# 1) build %moz_hash from $MOZ_DICTIONARY

my %moz_hash;

print "loading $MOZ_DICTIONARY...\n";
open(my $moz_file, "<", $MOZ_DICTIONARY) ||
  die ("Couldn't open $MOZ_DICTIONARY: $!");

print "building dictionary...\n";
while (my $line = <$moz_file>) {
  next unless ($line =~ m/^operator\./);

  # 1.1) split the line into key, property list and trailing comment
  my ($key, $properties, $description) =
    ($line =~ m/^([\w|\.|\\]*)\s=\s(.*)\s#\s(.*)$/);
  if (!defined($key)) {
    # Without this guard a malformed line would be stored under a bogus key
    # built from the captures of the previous successful match.
    warn "$MOZ_DICTIONARY:$.: skipping malformed operator entry\n";
    next;
  }

  # 1.2) build the array
  my @value = ();
  $value[0] = $description;
  $value[1] = ($properties =~ m/^.*lspace:(\d)/) ? $1 : "5";
  $value[2] = ($properties =~ m/^.*rspace:(\d)/) ? $1 : "5";
  $value[4] = ($properties =~ m/largeop/);
  $value[5] = ($properties =~ m/movablelimits/);
  $value[6] = ($properties =~ m/stretchy/);
  $value[7] = ($properties =~ m/separator/);
  $value[8] = ($properties =~ m/accent/);
  $value[9] = ($properties =~ m/fence/);
  $value[10] = ($properties =~ m/symmetric/);
  $value[13] = ($properties =~ m/^.*direction:([a-z]*)/) ? $1 : "";

  # 1.3) save the key and value
  $moz_hash{$key} = [ @value ];
}

close($moz_file);

################################################################################
# 2) If mode "check", verify validity of our operator dictionary and quit.
#    If mode "compare", go to step 3)

if ($mode eq "check") {
  print "checking operator dictionary...\n";
  open(my $file_syntax_errors, ">", $FILE_SYNTAX_ERRORS) ||
    die ("Couldn't open $FILE_SYNTAX_ERRORS: $!");

  my $nb_errors = 0;
  my $nb_warnings = 0;

  # check the validity of our private data
  # (keys are sorted so that the report is deterministic)
  for my $key (sort(keys(%moz_hash))) {
    my @moz = @{ $moz_hash{$key} };
    my @errors = ();

    if ($key =~ /\\u.+\\u.+\\u.+/) {
      push(@errors, "error: \"$key\" has more than 2 characters\n");
    }

    if ($key =~ /\\u20D2\./ || $key =~ /\\u0338\./) {
      push(@errors,
           "error: \"$key\" ends with character U+20D2 or U+0338\n");
    }

    if (!($moz[13] eq "" ||
          $moz[13] eq "horizontal" ||
          $moz[13] eq "vertical")) {
      push(@errors, "error: invalid direction \"$moz[13]\"\n");
    }

    if ($moz[4] && !($moz[13] eq "vertical")) {
      push(@errors,
           "error: operator is largeop but does not have vertical direction\n");
    }

    # Every kind of error is followed by the offending entry.
    if (@errors) {
      $nb_errors += scalar(@errors);
      print {$file_syntax_errors} @errors;
      print {$file_syntax_errors} generateEntry($key, @moz);
      print {$file_syntax_errors} "\n";
    }
  }

  # check that all forms have the same direction.
  my %checked_operator;
  for my $key (sort(keys(%moz_hash))) {
    next unless ($key =~ m/^([\w|\.|\\]*)\.(prefix|infix|postfix)$/);
    my $operator = $1;

    # each operator is examined once, whatever the number of forms it has
    next if ($checked_operator{$operator}++);

    my @forms = grep { exists($moz_hash{$_}) }
                ("$operator.prefix", "$operator.infix", "$operator.postfix");

    my %directions;
    $directions{ $moz_hash{$_}[13] } = 1 for @forms;

    if (scalar(keys(%directions)) > 1) {
      $nb_errors++;
      print {$file_syntax_errors}
        "error: operator has a stretchy form, but all forms";
      print {$file_syntax_errors}
        " have not the same direction\n";
      for my $form_key (@forms) {
        print {$file_syntax_errors}
          generateEntry($form_key, @{ $moz_hash{$form_key} });
      }
      print {$file_syntax_errors} "\n";
    }
  }

  close($file_syntax_errors) ||
    die ("Couldn't write $FILE_SYNTAX_ERRORS: $!");
  print "\n";
  if ($nb_errors > 0 || $nb_warnings > 0) {
    print "$nb_errors error(s) found\n";
    print "$nb_warnings warning(s) found\n";
    print "See output file $FILE_SYNTAX_ERRORS.\n\n";
  } else {
    print "No error found.\n\n";
  }

  exit 0;
}

################################################################################
# 3) build %wg_hash and @wg_keys from the page $WG_DICTIONARY

print "loading $WG_DICTIONARY...\n";
my $parser = XML::LibXML->new();
my $doc = $parser->parse_file($WG_DICTIONARY);

print "building dictionary...\n";
my %wg_hash;
my @wg_keys = ();

foreach my $entry ($doc->findnodes('/root/entry')) {
  my $unicode = $entry->getAttribute("unicode") // "";

  # Skip non-BMP Arabic characters that are handled specially.
  # This must be a string comparison: compared numerically, both sides
  # evaluate to 0 and every entry would be skipped.
  if ($unicode eq "U1EEF0" || $unicode eq "U1EEF1") {
    next;
  }

  # 3.1) build the key
  my $key = "operator.";
  my $remaining = "$unicode-";
  while ($remaining =~ m/^U?0(\w*)-(.*)$/) {
    # Concatenate .\uNNNN
    $key = "$key\\u$1";
    $remaining = $2;
  }

  my $form = $entry->getAttribute("form") // "";
  $key = "$key.$form";

  # 3.2) build the array
  my @value = ();
  $value[0] = lc($entry->getAttribute("description") // "");
  $value[1] = $entry->getAttribute("lspace") // "";
  if ($value[1] eq "") { $value[1] = "5"; }
  $value[2] = $entry->getAttribute("rspace") // "";
  if ($value[2] eq "") { $value[2] = "5"; }

  my $properties = $entry->getAttribute("properties") // "";
  $value[4] = ($properties =~ m/largeop/);
  $value[5] = ($properties =~ m/movablelimits/);
  $value[6] = ($properties =~ m/stretchy/);
  $value[7] = ($properties =~ m/separator/);
  $value[9] = ($properties =~ m/fence/);
  $value[10] = ($properties =~ m/symmetric/);

  # not stored in the WG dictionary
  $value[8] = ""; # accent
  $value[13] = ""; # direction

  # 3.3) save the key and value
  # (a key is listed once even if the document repeats it)
  push(@wg_keys, $key) unless (exists($wg_hash{$key}));
  $wg_hash{$key} = [ @value ];
}

################################################################################
# 4) Compare the two dictionaries and output the result

print "comparing dictionaries...\n";
open(my $file_differences, ">", $FILE_DIFFERENCES) ||
  die ("Couldn't open $FILE_DIFFERENCES: $!");
open(my $file_new_dictionary, ">", $FILE_NEW_DICTIONARY) ||
  die ("Couldn't open $FILE_NEW_DICTIONARY: $!");

my $conflicting = 0; my $conflicting_stretching = 0;
my $new = 0; my $new_stretching = 0;
my $obsolete = 0; my $obsolete_stretching = 0;
my $unchanged = 0;

# 4.1) look to the entries of the WG dictionary, in document order
for my $key (@wg_keys) {

  my $wg = $wg_hash{$key};
  my $wg_value = generateCommon(@{ $wg });

  if (exists($moz_hash{$key})) {
    # entry is in both dictionary; removing it from %moz_hash leaves only the
    # entries unknown to the WG for step 4.2
    my $moz = delete($moz_hash{$key});
    my $moz_value = generateCommon(@{ $moz });
    if ($moz_value ne $wg_value) {
      # conflicting entry
      print {$file_differences} "[conflict]";
      $conflicting++;
      # the flags are booleans (1 or ""), so compare their truth values
      if ($moz->[6] xor $wg->[6]) {
        print {$file_differences} "[stretching]";
        $conflicting_stretching++;
      }
      print {$file_differences} " - $key ($wg->[0])\n";
      print {$file_differences} "-$moz_value\n+$wg_value\n\n";
    } else {
      # unchanged entry
      $unchanged++;
    }
    # in both cases the WG data wins and our private data is kept
    print {$file_new_dictionary} completeCommon($wg_value, $key, $moz, $wg);
  } else {
    # we don't have this entry in our dictionary yet
    print {$file_differences} "[new entry]";
    $new++;
    if ($wg->[6]) {
      print {$file_differences} "[stretching]";
      $new_stretching++;
    }
    print {$file_differences} " - $key ($wg->[0])\n";
    print {$file_differences} "-\n+$wg_value\n\n";
    print {$file_new_dictionary} completeCommon($wg_value, $key, [], $wg);
  }
}

print {$file_new_dictionary}
  "\n# Entries below are not part of the official MathML dictionary\n\n";

# 4.2) look in our dictionary the remaining entries
for my $key (sort(keys(%moz_hash))) {
  my $moz = $moz_hash{$key};
  my $moz_value = generateCommon(@{ $moz });
  print {$file_differences} "[obsolete entry]";
  $obsolete++;
  if ($moz->[6]) {
    print {$file_differences} "[stretching]";
    $obsolete_stretching++;
  }
  print {$file_differences} " - $key ($moz->[0])\n";
  print {$file_differences} "-$moz_value\n+\n\n";
  # obsolete entries keep their accent, direction and comment
  print {$file_new_dictionary} completeCommon($moz_value, $key, $moz, []);
}

close($file_differences) || die ("Couldn't write $FILE_DIFFERENCES: $!");
close($file_new_dictionary) ||
  die ("Couldn't write $FILE_NEW_DICTIONARY: $!");

print "\n";
print "- $obsolete obsolete entries ";
print "($obsolete_stretching of them are related to stretching)\n";
print "- $unchanged unchanged entries\n";
print "- $conflicting conflicting entries ";
print "($conflicting_stretching of them are related to stretching)\n";
print "- $new new entries ";
print "($new_stretching of them are related to stretching)\n";
print "\nSee output files $FILE_DIFFERENCES and $FILE_NEW_DICTIONARY.\n\n";
print "After having modified the dictionary, please run ";
print "./updateOperatorDictionary.pl check\n\n";
exit 0;

################################################################################
sub usage {
  # display the accepted command syntax and quit
  print "usage:\n";
  print "  ./updateOperatorDictionary.pl download [unicode.xml]\n";
  print "  ./updateOperatorDictionary.pl compare [dictionary.xml]\n";
  print "  ./updateOperatorDictionary.pl check\n";
  print "  ./updateOperatorDictionary.pl clean\n";
  exit 0;
}

sub generateCommon {
  # helper function to generate the string of data shared by both dictionaries
  # argument: the entry table described in "structure of the dictionary"
  # returns:  "lspace:N rspace:N" followed by the name of each flag set
  my (@v) = @_;
  my $entry = "lspace:$v[1] rspace:$v[2]";
  if ($v[4]) { $entry = "$entry largeop"; }
  if ($v[5]) { $entry = "$entry movablelimits"; }
  if ($v[6]) { $entry = "$entry stretchy"; }
  if ($v[7]) { $entry = "$entry separator"; }
  if ($v[9]) { $entry = "$entry fence"; }
  if ($v[10]) { $entry = "$entry symmetric"; }
  return $entry;
}

sub completeCommon {
  # helper to add key and private data to generateCommon
  # arguments:
  #   $entry - string returned by generateCommon
  #   $key   - dictionary key, e.g. operator.\u002B.infix
  #   $v_moz - reference to our entry table ([] when we have no such entry)
  #   $v_wg  - reference to the WG entry table ([] when the WG has none)
  # The tables are passed by reference: two arrays passed as flat lists would
  # be merged into the first one, leaving the second always empty.
  # returns: the complete property line, newline included
  my ($entry, $key, $v_moz, $v_wg) = @_;

  $entry = "$key = $entry";

  if ($v_moz->[8]) { $entry = "$entry accent"; }
  if ($v_moz->[13]) { $entry = "$entry direction:$v_moz->[13]"; }

  if ($v_moz->[0]) {
    # keep our previous comment
    $entry = "$entry # $v_moz->[0]";
  } else {
    # otherwise use the description given by the WG
    my $description = $v_wg->[0] // "";
    $entry = "$entry # $description";
  }

  $entry = "$entry\n";
  return $entry;
}

sub generateEntry {
  # helper function to generate an entry of our operator dictionary
  # arguments: the key followed by the (flattened) entry table
  my ($key, @moz) = @_;
  my $entry = generateCommon(@moz);
  return completeCommon($entry, $key, \@moz, \@moz);
}