tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

updateOperatorDictionary.pl (14702B)


      1 #!/usr/bin/perl
      2 # -*- Mode: Perl; tab-width: 2; indent-tabs-mode: nil; -*-
      3 # This Source Code Form is subject to the terms of the Mozilla Public
      4 # License, v. 2.0. If a copy of the MPL was not distributed with this
      5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      6 
      7 use XML::LibXSLT;
      8 use XML::LibXML;
      9 use LWP::Simple;
     10 
     11 # output files
     12 $FILE_UNICODE = "unicode.xml";
     13 $FILE_DICTIONARY = "dictionary.xml";
     14 $FILE_DIFFERENCES = "differences.txt";
     15 $FILE_NEW_DICTIONARY = "new_dictionary.txt";
     16 $FILE_SYNTAX_ERRORS = "syntax_errors.txt";
     17 
     18 # our dictionary (property file)
     19 $MOZ_DICTIONARY = "mathfont.properties";
     20 
     21 # dictionary provided by the W3C in "XML Entity Definitions for Characters"
     22 $WG_DICTIONARY_URL = "https://raw.githubusercontent.com/w3c/xml-entities/gh-pages/unicode.xml";
     23 
     24 # XSL stylesheet to extract relevant data from the dictionary
     25 $DICTIONARY_XSL = "operatorDictionary.xsl";
     26 
     27 # dictionary provided by the W3C transformed with operatorDictionary.xsl 
     28 $WG_DICTIONARY = $FILE_DICTIONARY;
     29 
     30 if (!($#ARGV >= 0 &&
     31      ((($ARGV[0] eq "download") && $#ARGV <= 1) ||
     32       (($ARGV[0] eq "compare") && $#ARGV <= 1) ||
     33       (($ARGV[0] eq "check") && $#ARGV <= 0) ||
     34       (($ARGV[0] eq "clean") && $#ARGV <= 0)))) {
     35    &usage;
     36 }
     37 
     38 if ($ARGV[0] eq "download") {
     39    if ($#ARGV == 1) {
     40        $WG_DICTIONARY_URL = $ARGV[1];
     41    }
     42    print "Downloading $WG_DICTIONARY_URL...\n";
     43    getstore($WG_DICTIONARY_URL, $FILE_UNICODE);
     44 
     45    print "Converting $FILE_UNICODE into $FILE_DICTIONARY...\n";
     46    my $xslt = XML::LibXSLT->new();
     47    my $source = XML::LibXML->load_xml(location => $FILE_UNICODE);
     48    my $style_doc = XML::LibXML->load_xml(location => $DICTIONARY_XSL,
     49                                          no_cdata=>1);
     50    my $stylesheet = $xslt->parse_stylesheet($style_doc);
     51    my $results = $stylesheet->transform($source);
     52    open($file, ">$FILE_DICTIONARY") || die ("Couldn't open $FILE_DICTIONARY!");
     53    print $file $stylesheet->output_as_bytes($results);
     54    close($file);
     55    exit 0;
     56 }
     57 
     58 if ($ARGV[0] eq "clean") {
     59    unlink($FILE_UNICODE,
     60           $FILE_DICTIONARY,
     61           $FILE_DIFFERENCES,
     62           $FILE_NEW_DICTIONARY,
     63           $FILE_SYNTAX_ERRORS);
     64    exit 0;
     65 }
     66 
     67 if ($ARGV[0] eq "compare" && $#ARGV == 1) {
     68    $WG_DICTIONARY = $ARGV[1];
     69 }
     70 
     71 ################################################################################
     72 # structure of the dictionary used by this script:
     73 # - key: same as in mathfont.properties
     74 # - table:
     75 #    index | value
     76 #      0   | description
     77 #      1   | lspace
     78 #      2   | rspace
     79 #      4   | largeop
     80 #      5   | movablelimits
     81 #      6   | stretchy
     82 #      7   | separator
     83 #      8   | accent
     84 #      9   | fence
     85 #     10   | symmetric
     86 #     13   | direction
     87 
     88 # 1) build %moz_hash from $MOZ_DICTIONARY
     89 
     90 print "loading $MOZ_DICTIONARY...\n";
     91 open($file, $MOZ_DICTIONARY) || die ("Couldn't open $MOZ_DICTIONARY!");
     92 
     93 print "building dictionary...\n";
     94 while (<$file>) {
     95    next unless (m/^operator\.(.*)$/);
     96    (m/^([\w|\.|\\]*)\s=\s(.*)\s#\s(.*)$/);
     97 
     98    # 1.1) build the key
     99    $key = $1;
    100 
    101    # 1.2) build the array
    102    $_ = $2;
    103    @value = ();
    104    $value[0] = $3;
    105    if (m/^(.*)lspace:(\d)(.*)$/) { $value[1] = $2; } else { $value[1] = "5"; }
    106    if (m/^(.*)rspace:(\d)(.*)$/) { $value[2] = $2; } else { $value[2] = "5"; }
    107    $value[4] = (m/^(.*)largeop(.*)$/);
    108    $value[5] = (m/^(.*)movablelimits(.*)$/);
    109    $value[6] = (m/^(.*)stretchy(.*)$/);
    110    $value[7] = (m/^(.*)separator(.*)$/);
    111    $value[8] = (m/^(.*)accent(.*)$/);
    112    $value[9] = (m/^(.*)fence(.*)$/);
    113    $value[10] = (m/^(.*)symmetric(.*)$/);
    114    if (m/^(.*)direction:([a-z]*)(.*)$/) { $value[13] = $2; }
    115    else { $value[13] = ""; }
    116 
    117    # 1.3) save the key and value
    118    $moz_hash{$key} = [ @value ];
    119 }
    120 
    121 close($file);
    122 
    123 ################################################################################
    124 # 2) If mode "check", verify validity of our operator dictionary and quit.
    125 #    If mode "compare", go to step 3)
    126 
    127 if ($ARGV[0] eq "check") {
    128    print "checking operator dictionary...\n";
    129    open($file_syntax_errors, ">$FILE_SYNTAX_ERRORS") ||
    130        die ("Couldn't open $FILE_SYNTAX_ERRORS!");
    131 
    132    $nb_errors = 0;
    133    $nb_warnings = 0;
    134    @moz_keys = (keys %moz_hash);
    135    # check the validity of our private data
    136    while ($key = pop(@moz_keys)) {
    137 
    138        if ($key =~ /\\u.+\\u.+\\u.+/) {
    139            $valid = 0;
    140            $nb_errors++;
    141            print $file_syntax_errors "error: \"$key\" has more than 2 characters\n";
    142        }
    143 
    144        if ($key =~ /\\u20D2\./ || $key =~ /\\u0338\./) {
    145            $valid = 0;
    146            $nb_errors++;
    147            print $file_syntax_errors "error: \"$key\" ends with character U+20D2 or U+0338\n";
    148        }
    149 
    150        @moz = @{ $moz_hash{$key} };
    151        $entry = &generateEntry($key, @moz);
    152        $valid = 1;
    153 
    154        if (!(@moz[13] eq "" ||
    155              @moz[13] eq "horizontal" ||
    156              @moz[13] eq "vertical")) {
    157            $valid = 0;
    158            $nb_errors++;
    159            print $file_syntax_errors "error: invalid direction \"$moz[13]\"\n";
    160        }
    161 
    162        if (@moz[4] && !(@moz[13] eq "vertical")) {
    163            $valid = 0;
    164            $nb_errors++;
    165            print $file_syntax_errors "error: operator is largeop but does not have vertical direction\n";
    166        }
    167        
    168        if (!$valid) {
    169            print $file_syntax_errors $entry;
    170            print $file_syntax_errors "\n";
    171        }
    172    }
    173 
    174    # check that all forms have the same direction.
    175    @moz_keys = (keys %moz_hash);
    176    while ($key = pop(@moz_keys)) {
    177 
    178        if (@{ $moz_hash{$key} }) {
    179            # the operator has not been removed from the hash table yet.
    180 
    181            $_ = $key;
    182            (m/^([\w|\.|\\]*)\.(prefix|infix|postfix)$/);
    183            $key_prefix = "$1.prefix";
    184            $key_infix = "$1.infix";
    185            $key_postfix = "$1.postfix";
    186            @moz_prefix = @{ $moz_hash{$key_prefix} };
    187            @moz_infix = @{ $moz_hash{$key_infix} };
    188            @moz_postfix = @{ $moz_hash{$key_postfix} };
    189 
    190            $same_direction = 1;
    191 
    192            if (@moz_prefix) {
    193                if (@moz_infix &&
    194                    !($moz_infix[13] eq $moz_prefix[13])) {
    195                    $same_direction = 0;
    196                }
    197                if (@moz_postfix &&
    198                    !($moz_postfix[13] eq $moz_prefix[13])) {
    199                    $same_direction = 0;
    200                }
    201            }
    202            if (@moz_infix) {
    203                if (@moz_postfix &&
    204                    !($moz_postfix[13] eq $moz_infix[13])) {
    205                    $same_direction = 0;
    206                }
    207            }
    208 
    209            if (!$same_direction) {
    210                $nb_errors++;
    211                print  $file_syntax_errors
    212                    "error: operator has a stretchy form, but all forms";
    213                print  $file_syntax_errors
    214                    " have not the same direction\n";
    215                if (@moz_prefix) {
    216                    $_ = &generateEntry($key_prefix, @moz_prefix);
    217                    print $file_syntax_errors $_;
    218                }
    219                if (@moz_infix) {
    220                    $_ = &generateEntry($key_infix, @moz_infix);
    221                    print $file_syntax_errors $_;
    222                }
    223                if (@moz_postfix) {
    224                    $_ = &generateEntry($key_postfix, @moz_postfix);
    225                    print $file_syntax_errors $_;
    226                }
    227                print $file_syntax_errors "\n";
    228            }
    229            
    230            if (@moz_prefix) {
    231                delete $moz_hash{$key.prefix};
    232            }
    233            if (@moz_infix) {
    234                delete $moz_hash{$key_infix};
    235            }
    236            if (@moz_postfix) {
    237                delete $moz_hash{$key_postfix};
    238            }
    239        }
    240    }
    241 
    242    close($file_syntax_errors);
    243    print "\n";
    244    if ($nb_errors > 0 || $nb_warnings > 0) {
    245        print "$nb_errors error(s) found\n";
    246        print "$nb_warnings warning(s) found\n";
    247        print "See output file $FILE_SYNTAX_ERRORS.\n\n";
    248    } else {
    249        print "No error found.\n\n";
    250    }
    251 
    252    exit 0;
    253 }
    254 
    255 ################################################################################
    256 # 3) build %wg_hash and @wg_keys from the page $WG_DICTIONARY
    257 
    258 print "loading $WG_DICTIONARY...\n";
    259 my $parser = XML::LibXML->new();
    260 my $doc = $parser->parse_file($WG_DICTIONARY);
    261 
    262 print "building dictionary...\n";
    263 @wg_keys = ();
    264 
    265 foreach my $entry ($doc->findnodes('/root/entry')) {
    266    # 3.1) build the key
    267    $key = "operator.";
    268 
    269    $_ = $entry->getAttribute("unicode");
    270 
    271    # Skip non-BMP Arabic characters that are handled specially.
    272    if ($_ == "U1EEF0" || $_ == "U1EEF1") {
    273        next;
    274    }
    275 
    276    $_ = "$_-";
    277    while (m/^U?0(\w*)-(.*)$/) {
    278        # Concatenate .\uNNNN
    279        $key = "$key\\u$1";
    280        $_ = $2;
    281    }
    282 
    283    $_ = $entry->getAttribute("form"); # "Form"
    284    $key = "$key.$_";
    285 
    286    # 3.2) build the array
    287    @value = ();
    288    $value[0] = lc($entry->getAttribute("description"));
    289    $value[1] = $entry->getAttribute("lspace");
    290    if ($value[1] eq "") { $value[1] = "5"; }
    291    $value[2] = $entry->getAttribute("rspace");
    292    if ($value[2] eq "") { $value[2] = "5"; }
    293 
    294    $_ = $entry->getAttribute("properties");
    295    $value[4] = (m/^(.*)largeop(.*)$/);
    296    $value[5] = (m/^(.*)movablelimits(.*)$/);
    297    $value[6] = (m/^(.*)stretchy(.*)$/);
    298    $value[7] = (m/^(.*)separator(.*)$/);
    299    $value[9] = (m/^(.*)fence(.*)$/);
    300    $value[10] = (m/^(.*)symmetric(.*)$/);
    301 
    302    # not stored in the WG dictionary
    303    $value[8] = ""; # accent
    304    $value[13] = ""; # direction
    305 
    306    # 3.3) save the key and value
    307    push(@wg_keys, $key);
    308    $wg_hash{$key} = [ @value ];
    309 }
    310 @wg_keys = reverse(@wg_keys);
    311 
    312 ################################################################################
    313 # 4) Compare the two dictionaries and output the result
    314 
    315 print "comparing dictionaries...\n";
    316 open($file_differences, ">$FILE_DIFFERENCES") ||
    317    die ("Couldn't open $FILE_DIFFERENCES!");
    318 open($file_new_dictionary, ">$FILE_NEW_DICTIONARY") ||
    319    die ("Couldn't open $FILE_NEW_DICTIONARY!");
    320 
    321 $conflicting = 0; $conflicting_stretching = 0;
    322 $new = 0; $new_stretching = 0;
    323 $obsolete = 0; $obsolete_stretching = 0;
    324 $unchanged = 0;
    325 
    326 # 4.1) look to the entries of the WG dictionary
    327 while ($key = pop(@wg_keys)) {
    328 
    329    @wg = @{ $wg_hash{$key} };
    330    delete $wg_hash{$key};
    331    $wg_value = &generateCommon(@wg);
    332 
    333    if (exists($moz_hash{$key})) {
    334        # entry is in both dictionary
    335        @moz = @{ $moz_hash{$key} };
    336        delete $moz_hash{$key};
    337        $moz_value = &generateCommon(@moz);
    338        if ($moz_value ne $wg_value) {
    339            # conflicting entry
    340            print $file_differences "[conflict]";
    341            $conflicting++;
    342            if ($moz[6] != $wg[6]) {
    343                print $file_differences "[stretching]";
    344                $conflicting_stretching++;
    345            }
    346            print $file_differences " - $key ($wg[0])\n";
    347            print $file_differences "-$moz_value\n+$wg_value\n\n";
    348            $_ = &completeCommon($wg_value, $key, @moz, @wg);
    349            print $file_new_dictionary $_;
    350        } else {
    351            # unchanged entry
    352            $unchanged++;
    353            $_ = &completeCommon($wg_value, $key, @moz, @wg);
    354            print $file_new_dictionary $_;
    355        }
    356    } else {
    357        # we don't have this entry in our dictionary yet
    358        print $file_differences "[new entry]";
    359        $new++;
    360        if ($wg[6]) {
    361            print $file_differences "[stretching]";
    362            $new_stretching++;
    363        }
    364        print $file_differences " - $key ($wg[0])\n";
    365        print $file_differences "-\n+$wg_value\n\n";
    366        $_ = &completeCommon($wg_value, $key, (), @wg);
    367        print $file_new_dictionary $_;
    368    }
    369 }
    370 
    371 print $file_new_dictionary
    372    "\n# Entries below are not part of the official MathML dictionary\n\n";
    373 # 4.2) look in our dictionary the remaining entries
    374 @moz_keys = (keys %moz_hash);
    375 @moz_keys = reverse(sort(@moz_keys));
    376 
    377 while ($key = pop(@moz_keys)) {
    378    @moz = @{ $moz_hash{$key} };
    379    $moz_value = &generateCommon(@moz);
    380    print $file_differences "[obsolete entry]";
    381    $obsolete++;
    382    if ($moz[6]) {
    383        print $file_differences "[stretching]";
    384        $obsolete_stretching++;
    385    }
    386    print $file_differences " - $key ($moz[0])\n";
    387    print $file_differences "-$moz_value\n+\n\n";
    388    $_ = &completeCommon($moz_value, $key, (), @moz);
    389    print $file_new_dictionary $_;
    390 }
    391 
    392 close($file_differences);
    393 close($file_new_dictionary);
    394 
    395 print "\n";
    396 print "- $obsolete obsolete entries ";
    397 print "($obsolete_stretching of them are related to stretching)\n";
    398 print "- $unchanged unchanged entries\n";
    399 print "- $conflicting conflicting entries ";
    400 print "($conflicting_stretching of them are related to stretching)\n";
    401 print "- $new new entries ";
    402 print "($new_stretching of them are related to stretching)\n";
    403 print "\nSee output files $FILE_DIFFERENCES and $FILE_NEW_DICTIONARY.\n\n";
    404 print "After having modified the dictionary, please run";
    405 print "./updateOperatorDictionary check\n\n";
    406 exit 0;
    407 
    408 ################################################################################
    409 sub usage {
    410    # display the accepted command syntax and quit
    411    print "usage:\n";
    412    print "  ./updateOperatorDictionary.pl download [unicode.xml]\n";
    413    print "  ./updateOperatorDictionary.pl compare [dictionary.xml]\n";
    414    print "  ./updateOperatorDictionary.pl check\n";
    415    print "  ./updateOperatorDictionary.pl clean\n";
    416    exit 0;
    417 }
    418 
    419 sub generateCommon {
    420    # helper function to generate the string of data shared by both dictionaries
    421    my(@v) = @_;
    422    $entry = "lspace:$v[1] rspace:$v[2]";
    423    if ($v[4]) { $entry = "$entry largeop"; }
    424    if ($v[5]) { $entry = "$entry movablelimits"; }
    425    if ($v[6]) { $entry = "$entry stretchy"; }
    426    if ($v[7]) { $entry = "$entry separator"; }
    427    if ($v[9]) { $entry = "$entry fence"; }
    428    if ($v[10]) { $entry = "$entry symmetric"; }
    429    return $entry;
    430 }
    431 
    432 sub completeCommon {
    433    # helper to add key and private data to generateCommon
    434    my($entry, $key, @v_moz, @v_wg) = @_;
    435    
    436    $entry = "$key = $entry";
    437 
    438    if ($v_moz[8]) { $entry = "$entry accent"; }
    439    if ($v_moz[13]) { $entry = "$entry direction:$v_moz[13]"; }
    440 
    441    if ($v_moz[0]) {
    442        # keep our previous comment
    443        $entry = "$entry # $v_moz[0]";
    444    } else {
    445        # otherwise use the description given by the WG
    446        $entry = "$entry # $v_wg[0]";
    447    }
    448 
    449    $entry = "$entry\n";
    450    return $entry;
    451 }
    452 
    453 sub generateEntry {
    454    # helper function to generate an entry of our operator dictionary
    455    my($key, @moz) = @_;
    456    $entry = &generateCommon(@moz);
    457    $entry = &completeCommon($entry, $key, @moz, @moz);
    458    return $entry;
    459 }