tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

genUnicodePropertyData.pl (11144B)


      1 #!/usr/bin/env perl
      2 
      3 # This Source Code Form is subject to the terms of the Mozilla Public
      4 # License, v. 2.0. If a copy of the MPL was not distributed with this
      5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      6 
      7 # This tool is used to prepare lookup tables of Unicode character properties
      8 # needed by gfx code to support text shaping operations. The properties are
      9 # read from the Unicode Character Database and compiled into multi-level arrays
     10 # for efficient lookup.
     11 #
     12 # Note that for most properties, we now rely on ICU; this tool and the tables
     13 # it generates are used only for a couple of properties not readily exposed
     14 # via ICU APIs.
     15 #
     16 # To regenerate the tables in nsUnicodePropertyData.cpp:
     17 #
     18 # (1) Download the current Unicode data files from
     19 #
     20 #         https://www.unicode.org/Public/UNIDATA/
     21 #
     22 #     NB: not all the files are actually needed; currently, we require
     23 #       - UnicodeData.txt
     24 #       - ReadMe.txt (to record version/date of the UCD)
     25 #       - Unihan_Variants.txt (from Unihan.zip)
     26 #     though this may change if we find a need for additional properties.
     27 #
     28 #     The Unicode data files listed above should be together in one directory.
     29 #
     30 #
     31 # (2) Run this tool using a command line of the form
     32 #
     33 #         perl genUnicodePropertyData.pl      \
     34 #                 /path/to/icu/common/unicode \
     35 #                 /path/to/UCD-directory
     36 #
     37 #     This will generate (or overwrite!) the files
     38 #
     39 #         nsUnicodePropertyData.cpp
     40 #         UnicodeScriptCodes.h
     41 #
     42 #     in the current directory.
     43 
     44 use strict;
     45 use List::Util qw(first);
     46 
     47 if ($#ARGV != 1) {
     48    print <<__EOT;
     49 # Run this tool using a command line of the form
     50 #
     51 #     perl genUnicodePropertyData.pl      \\
     52 #             /path/to/icu/common/unicode \\
     53 #             /path/to/UCD-directory
     54 #
     55 # where icu/common/unicode is the directory containing ICU 'common' headers,
     56 # and UCD-directory is a directory containing the current Unicode Character
     57 # Database files (UnicodeData.txt, etc), available from
     58 # https://www.unicode.org/Public/UNIDATA/, with additional resources as
     59 # detailed in the source comments.
     60 #
     61 # This will generate (or overwrite!) the files
     62 #
     63 #     nsUnicodePropertyData.cpp
     64 #     UnicodeScriptCodes.h
     65 #
     66 # in the current directory.
     67 __EOT
     68    exit 0;
     69 }
     70 
     71 my $ICU = $ARGV[0];
     72 my $UNICODE = $ARGV[1];
     73 
     74 my @scriptCodeToName;
     75 
     76 my $sc = -1;
     77 
     78 sub readIcuHeader
     79 {
     80    my $file = shift;
     81    open FH, "< $ICU/$file" or die "can't open ICU header $ICU/$file\n";
     82    while (<FH>) {
     83        # adjust for ICU vs UCD naming discrepancies
     84        s/LANNA/TAI_THAM/;
     85        s/MEITEI_MAYEK/MEETEI_MAYEK/;
     86        s/ORKHON/OLD_TURKIC/;
     87        s/MENDE/MENDE_KIKAKUI/;
     88        s/SIGN_WRITING/SIGNWRITING/;
     89        if (m|USCRIPT_([A-Z_]+)\s*=\s*([0-9]+),\s*/\*\s*([A-Z][a-z]{3})\s*\*/|) {
     90            $sc = $2;
     91            $scriptCodeToName[$sc] = $1;
     92        }
     93    }
     94    close FH;
     95 }
     96 
     97 &readIcuHeader("uscript.h");
     98 
     99 die "didn't find ICU script codes\n" if $sc == -1;
    100 
    101 # initialize default properties
    102 my @hanVariant;
    103 my @fullWidth;
    104 my @fullWidthInverse;
    105 for (my $i = 0; $i < 0x110000; ++$i) {
    106    $hanVariant[$i] = 0;
    107    $fullWidth[$i] = 0;
    108    $fullWidthInverse[$i] = 0;
    109 }
    110 
    111 # read ReadMe.txt
    112 my @versionInfo;
    113 open FH, "< $UNICODE/ReadMe.txt" or die "can't open Unicode ReadMe.txt file\n";
    114 while (<FH>) {
    115    chomp;
    116    push @versionInfo, $_;
    117 }
    118 close FH;
    119 
    120 # read UnicodeData.txt
    121 open FH, "< $UNICODE/UnicodeData.txt" or die "can't open UCD file UnicodeData.txt\n";
    122 while (<FH>) {
    123    chomp;
    124    my @fields = split /;/;
    125    if ($fields[1] =~ /First/) {
    126        my $first = hex "0x$fields[0]";
    127        $_ = <FH>;
    128        @fields = split /;/;
    129        if ($fields[1] =~ /Last/) {
    130            my $last = hex "0x$fields[0]";
    131            do {
    132                if ($fields[1] =~ /CJK/) {
    133                  @hanVariant[$first] = 3;
    134                }
    135                $first++;
    136            } while ($first <= $last);
    137        } else {
    138            die "didn't find Last code for range!\n";
    139        }
    140    } else {
    141        my $usv = hex "0x$fields[0]";
    142        if ($fields[1] =~ /CJK/) {
    143          @hanVariant[$usv] = 3;
    144        }
    145        if ($fields[5] =~ /^<narrow>/) {
    146          my $wideChar = hex(substr($fields[5], 9));
    147          die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
    148          $fullWidth[$usv] = $wideChar;
    149          $fullWidthInverse[$wideChar] = $usv;
    150        }
    151        elsif ($fields[5] =~ /^<wide>/) {
    152          my $narrowChar = hex(substr($fields[5], 7));
    153          die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
    154          $fullWidth[$narrowChar] = $usv;
    155          $fullWidthInverse[$usv] = $narrowChar;
    156        }
    157    }
    158 }
    159 close FH;
    160 
    161 open FH, "< $UNICODE/Unihan_Variants.txt" or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip)\n";
    162 push @versionInfo, "";
    163 while (<FH>) {
    164  chomp;
    165  push @versionInfo, $_;
    166  last if /Date:/;
    167 }
    168 my $savedusv = 0;
    169 my $hasTC = 0;
    170 my $hasSC = 0;
    171 while (<FH>) {
    172  chomp;
    173  if (m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/) {
    174    my $usv = hex "0x$1";
    175    if ($usv != $savedusv) {
    176      unless ($savedusv == 0) {
    177        if ($hasTC && !$hasSC) {
    178          $hanVariant[$savedusv] = 1;
    179        } elsif (!$hasTC && $hasSC) {
    180          $hanVariant[$savedusv] = 2;
    181        }
    182      }
    183      $savedusv = $usv;
    184      $hasTC = 0;
    185      $hasSC = 0;
    186    }
    187    if ($2 eq "Traditional") {
    188      $hasTC = 1;
    189    }
    190    if ($2 eq "Simplified") {
    191      $hasSC = 1;
    192    }
    193  } 
    194 }
    195 close FH;
    196 
    197 my $timestamp = gmtime();
    198 
    199 open DATA_TABLES, "> nsUnicodePropertyData.cpp" or die "unable to open nsUnicodePropertyData.cpp for output";
    200 
    201 my $licenseBlock = q[/* This Source Code Form is subject to the terms of the Mozilla Public
    202 * License, v. 2.0. If a copy of the MPL was not distributed with this
    203 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
    204 
    205 /*
    206 * Derived from the Unicode Character Database by genUnicodePropertyData.pl
    207 *
    208 * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
    209 */
    210 ];
    211 
    212 my $versionInfo = join("\n", @versionInfo);
    213 
    214 print DATA_TABLES <<__END;
    215 $licenseBlock
    216 /*
    217 * Created on $timestamp from UCD data files with version info:
    218 *
    219 
    220 $versionInfo
    221 
    222 *
    223 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
    224 */
    225 
    226 #include <stdint.h>
    227 #include "harfbuzz/hb.h"
    228 
    229 __END
    230 
    231 open HEADER, "> UnicodeScriptCodes.h" or die "unable to open UnicodeScriptCodes.h for output";
    232 
    233 print HEADER <<__END;
    234 $licenseBlock
    235 /*
    236 * Created on $timestamp from UCD data files with version info:
    237 *
    238 
    239 $versionInfo
    240 
    241 *
    242 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
    243 */
    244 
    245 #ifndef intl_components_UnicodeScriptCodes_h_
    246 #define intl_components_UnicodeScriptCodes_h_
    247 
    248 __END
    249 
    250 our $totalData = 0;
    251 
    252 sub sprintHanVariants
    253 {
    254  my $baseUsv = shift;
    255  my $varShift = 0;
    256  my $val = 0;
    257  while ($varShift < 8) {
    258    $val |= $hanVariant[$baseUsv++] << $varShift;
    259    $varShift += 2;
    260  }
    261  return sprintf("0x%02x,", $val);
    262 }
    263 ## Han Variant data currently unused but may be needed in future, see bug 857481
    264 ## &genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);
    265 
    266 sub sprintFullWidth
    267 {
    268  my $usv = shift;
    269  return sprintf("0x%04x,", $fullWidth[$usv]);
    270 }
    271 &genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);
    272 
    273 sub sprintFullWidthInverse
    274 {
    275  my $usv = shift;
    276  return sprintf("0x%04x,", $fullWidthInverse[$usv]);
    277 }
    278 &genTables("FullWidthInverse", "", "uint16_t", 10, 6, \&sprintFullWidthInverse, 0, 2, 1);
    279 
    280 print STDERR "Total data = $totalData\n";
    281 
    282 sub genTables
    283 {
    284  my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;
    285 
    286  if ($typedef ne '') {
    287    print HEADER "$typedef\n";
    288  }
    289 
    290  print DATA_TABLES "#define k${prefix}MaxPlane  $maxPlane\n";
    291  print DATA_TABLES "#define k${prefix}IndexBits $indexBits\n";
    292  print DATA_TABLES "#define k${prefix}CharBits  $charBits\n";
    293 
    294  my $indexLen = 1 << $indexBits;
    295  my $charsPerPage = 1 << $charBits;
    296  my %charIndex = ();
    297  my %pageMapIndex = ();
    298  my @pageMap = ();
    299  my @char = ();
    300  
    301  my $planeMap = "\x00" x $maxPlane;
    302  foreach my $plane (0 .. $maxPlane) {
    303    my $pageMap = "\x00" x $indexLen * 2;
    304    foreach my $page (0 .. $indexLen - 1) {
    305        my $charValues = "";
    306        for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
    307            my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
    308            $charValues .= &$func($usv);
    309        }
    310        chop $charValues;
    311 
    312        unless (exists $charIndex{$charValues}) {
    313            $charIndex{$charValues} = scalar keys %charIndex;
    314            $char[$charIndex{$charValues}] = $charValues;
    315        }
    316        substr($pageMap, $page * 2, 2) = pack('S', $charIndex{$charValues});
    317    }
    318    
    319    unless (exists $pageMapIndex{$pageMap}) {
    320        $pageMapIndex{$pageMap} = scalar keys %pageMapIndex;
    321        $pageMap[$pageMapIndex{$pageMap}] = $pageMap;
    322    }
    323    if ($plane > 0) {
    324        substr($planeMap, $plane - 1, 1) = pack('C', $pageMapIndex{$pageMap});
    325    }
    326  }
    327 
    328  if ($maxPlane) {
    329    print DATA_TABLES "static const uint8_t s${prefix}Planes[$maxPlane] = {";
    330    print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('C*', $planeMap));
    331    print DATA_TABLES "};\n\n";
    332  }
    333 
    334  my $chCount = scalar @char;
    335  my $pmBits = $chCount > 255 ? 16 : 8;
    336  my $pmCount = scalar @pageMap;
    337  if ($maxPlane == 0) {
    338    die "there should only be one pageMap entry!" if $pmCount > 1;
    339    print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
    340  } else {
    341    print DATA_TABLES "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
    342  }
    343  for (my $i = 0; $i < scalar @pageMap; ++$i) {
    344    print DATA_TABLES $maxPlane > 0 ? "  {" : "  ";
    345    print DATA_TABLES join(',', map { sprintf("%d", $_) } unpack('S*', $pageMap[$i]));
    346    print DATA_TABLES $maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n";
    347  }
    348  print DATA_TABLES "};\n\n";
    349 
    350  my $pageLen = $charsPerPage / $charsPerEntry;
    351  print DATA_TABLES "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
    352  for (my $i = 0; $i < scalar @char; ++$i) {
    353    print DATA_TABLES "  {";
    354    print DATA_TABLES $char[$i];
    355    print DATA_TABLES $i < $#char ? "},\n" : "}\n";
    356  }
    357  print DATA_TABLES "};\n";
    358 
    359  my $dataSize = $pmCount * $indexLen * $pmBits/8 +
    360                 $chCount * $pageLen * $bytesPerEntry + 
    361                 $maxPlane;
    362  $totalData += $dataSize;
    363 
    364  print STDERR "Data for $prefix = $dataSize\n";
    365 }
    366 
    367 print DATA_TABLES <<__END;
    368 /*
    369 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
    370 */
    371 __END
    372 
    373 close DATA_TABLES;
    374 
    375 print HEADER "namespace mozilla::intl {\n";
    376 print HEADER "enum class Script : int16_t {\n";
    377 for (my $i = 0; $i < scalar @scriptCodeToName; ++$i) {
    378  print HEADER "  ", $scriptCodeToName[$i], " = ", $i, ",\n";
    379 }
    380 print HEADER "\n  NUM_SCRIPT_CODES = ", scalar @scriptCodeToName, ",\n";
    381 print HEADER "\n  INVALID = -1\n";
    382 print HEADER "};\n";
    383 print HEADER "} // namespace mozilla::intl\n\n";
    384 
    385 print HEADER <<__END;
    386 #endif
    387 /*
    388 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
    389 */
    390 __END
    391 
    392 close HEADER;