genUnicodePropertyData.pl (11144B)
1 #!/usr/bin/env perl 2 3 # This Source Code Form is subject to the terms of the Mozilla Public 4 # License, v. 2.0. If a copy of the MPL was not distributed with this 5 # file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 7 # This tool is used to prepare lookup tables of Unicode character properties 8 # needed by gfx code to support text shaping operations. The properties are 9 # read from the Unicode Character Database and compiled into multi-level arrays 10 # for efficient lookup. 11 # 12 # Note that for most properties, we now rely on ICU; this tool and the tables 13 # it generates are used only for a couple of properties not readily exposed 14 # via ICU APIs. 15 # 16 # To regenerate the tables in nsUnicodePropertyData.cpp: 17 # 18 # (1) Download the current Unicode data files from 19 # 20 # https://www.unicode.org/Public/UNIDATA/ 21 # 22 # NB: not all the files are actually needed; currently, we require 23 # - UnicodeData.txt 24 # - ReadMe.txt (to record version/date of the UCD) 25 # - Unihan_Variants.txt (from Unihan.zip) 26 # though this may change if we find a need for additional properties. 27 # 28 # The Unicode data files listed above should be together in one directory. 29 # 30 # 31 # (2) Run this tool using a command line of the form 32 # 33 # perl genUnicodePropertyData.pl \ 34 # /path/to/icu/common/unicode \ 35 # /path/to/UCD-directory 36 # 37 # This will generate (or overwrite!) the files 38 # 39 # nsUnicodePropertyData.cpp 40 # UnicodeScriptCodes.h 41 # 42 # in the current directory. 
use strict;
use warnings;
use List::Util qw(first);

# Exactly two arguments are required: the ICU header directory and the UCD
# directory. Anything else prints the usage text and stops.
if (@ARGV != 2) {
    print <<__EOT;
# Run this tool using a command line of the form
#
# perl genUnicodePropertyData.pl \\
# /path/to/icu/common/unicode \\
# /path/to/UCD-directory
#
# where icu/common/unicode is the directory containing ICU 'common' headers,
# and UCD-directory is a directory containing the current Unicode Character
# Database files (UnicodeData.txt, etc), available from
# https://www.unicode.org/Public/UNIDATA/, with additional resources as
# detailed in the source comments.
#
# This will generate (or overwrite!) the files
#
# nsUnicodePropertyData.cpp
# UnicodeScriptCodes.h
#
# in the current directory.
__EOT
    exit 0;
}

my $ICU = $ARGV[0];
my $UNICODE = $ARGV[1];

# Script names indexed by their numeric ICU script code.
my @scriptCodeToName;

# Numeric code of the most recently parsed script; stays at -1 if the ICU
# header yielded no script codes at all.
my $sc = -1;

# readIcuHeader($file)
#
# Reads the header $file from the ICU directory given on the command line and
# records every enumerator of the form
#     USCRIPT_<NAME> = <number>, /* Xxxx */
# in @scriptCodeToName (index = number, value = NAME). A handful of ICU names
# are rewritten first so that the recorded names follow UCD naming.
# Dies if the header cannot be opened.
sub readIcuHeader
{
    my $file = shift;
    my $path = "$ICU/$file";
    # Three-argument open: the path is never parsed for mode characters.
    open my $icuFh, '<', $path or die "can't open ICU header $path: $!\n";
    while (my $line = <$icuFh>) {
        # adjust for ICU vs UCD naming discrepancies
        $line =~ s/LANNA/TAI_THAM/;
        $line =~ s/MEITEI_MAYEK/MEETEI_MAYEK/;
        $line =~ s/ORKHON/OLD_TURKIC/;
        $line =~ s/MENDE/MENDE_KIKAKUI/;
        $line =~ s/SIGN_WRITING/SIGNWRITING/;
        if ($line =~ m|USCRIPT_([A-Z_]+)\s*=\s*([0-9]+),\s*/\*\s*([A-Z][a-z]{3})\s*\*/|) {
            $sc = $2;
            $scriptCodeToName[$sc] = $1;
        }
    }
    close $icuFh;
}

readIcuHeader("uscript.h");

die "didn't find ICU script codes\n" if $sc == -1;

# initialize default properties: one zero entry per Unicode code point
my @hanVariant = (0) x 0x110000;
my @fullWidth = (0) x 0x110000;
my @fullWidthInverse = (0) x 0x110000;

# read ReadMe.txt; its lines are kept verbatim (minus line endings) so they
# can be quoted in the generated files as version information
my @versionInfo;
{
    open my $readmeFh, '<', "$UNICODE/ReadMe.txt"
        or die "can't open Unicode ReadMe.txt file: $!\n";
    chomp(@versionInfo = <$readmeFh>);
    close $readmeFh;
}

# read UnicodeData.txt
# The bareword handle FH is kept on purpose: the code that follows reads the
# file through this handle.
open FH, '<', "$UNICODE/UnicodeData.txt"
    or die "can't open UCD file UnicodeData.txt: $!\n";
# Scan UnicodeData.txt through FH, which is opened on that file just before
# this point. For each record:
#   - code points whose name field contains "CJK" get $hanVariant[...] = 3;
#   - a <narrow> or <wide> decomposition links a character and its wide
#     counterpart in @fullWidth / @fullWidthInverse.
while (my $line = <FH>) {
    chomp $line;
    my @fields = split /;/, $line;
    next if @fields < 2;    # blank or malformed line: nothing to record

    if ($fields[1] =~ /First/) {
        # A "First" record must be followed by the "Last" record that closes
        # the range; every code point in between shares the same name.
        my $first = hex $fields[0];
        my $lastLine = <FH>;
        die "didn't find Last code for range!\n" unless defined $lastLine;
        @fields = split /;/, $lastLine;
        die "didn't find Last code for range!\n"
            unless defined $fields[1] && $fields[1] =~ /Last/;
        my $last = hex $fields[0];
        if ($fields[1] =~ /CJK/) {
            $hanVariant[$_] = 3 for $first .. $last;
        }
        next;
    }

    my $usv = hex $fields[0];
    if ($fields[1] =~ /CJK/) {
        $hanVariant[$usv] = 3;
    }

    # Field 5 is the decomposition; short records simply have none.
    my $decomp = defined $fields[5] ? $fields[5] : '';
    if ($decomp =~ /^<narrow>/) {
        # skip "<narrow> " (9 characters) to reach the hex code point
        my $wideChar = hex(substr($decomp, 9));
        die "didn't expect supplementary-plane values here" if $usv > 0xffff || $wideChar > 0xffff;
        $fullWidth[$usv] = $wideChar;
        $fullWidthInverse[$wideChar] = $usv;
    }
    elsif ($decomp =~ /^<wide>/) {
        # skip "<wide> " (7 characters) to reach the hex code point
        my $narrowChar = hex(substr($decomp, 7));
        die "didn't expect supplementary-plane values here" if $usv > 0xffff || $narrowChar > 0xffff;
        $fullWidth[$narrowChar] = $usv;
        $fullWidthInverse[$usv] = $narrowChar;
    }
}
close FH;

open my $unihanFh, '<', "$UNICODE/Unihan_Variants.txt"
    or die "can't open UCD file Unihan_Variants.txt (from Unihan.zip): $!\n";

# Copy the file's leading lines, up to and including the one containing
# "Date:", into the version information quoted in the generated files.
push @versionInfo, "";
while (my $line = <$unihanFh>) {
    chomp $line;
    push @versionInfo, $line;
    last if $line =~ /Date:/;
}

# Entries for one code point are consecutive, so flags are accumulated until
# the code point changes and then committed.
my $savedusv = 0;
my $hasTC = 0;
my $hasSC = 0;

# Commit the flags gathered for $savedusv: 1 if it has only a Traditional
# variant entry, 2 if it has only a Simplified variant entry; otherwise the
# existing value is left alone.
my $flushHanVariant = sub {
    return if $savedusv == 0;
    if ($hasTC && !$hasSC) {
        $hanVariant[$savedusv] = 1;
    } elsif (!$hasTC && $hasSC) {
        $hanVariant[$savedusv] = 2;
    }
};

while (my $line = <$unihanFh>) {
    chomp $line;
    if ($line =~ m/U\+([0-9A-F]{4,6})\s+k([^ ]+)Variant/) {
        my $usv = hex $1;
        my $variantKind = $2;
        if ($usv != $savedusv) {
            $flushHanVariant->();
            $savedusv = $usv;
            $hasTC = 0;
            $hasSC = 0;
        }
        if ($variantKind eq "Traditional") {
            $hasTC = 1;
        }
        if ($variantKind eq "Simplified") {
            $hasSC = 1;
        }
    }
}
# The final code point in the file has no successor to trigger a flush, so
# commit it explicitly.
$flushHanVariant->();
close $unihanFh;

my $timestamp = gmtime();

open my $dataTablesFh, '>', "nsUnicodePropertyData.cpp"
    or die "unable to open nsUnicodePropertyData.cpp for output: $!";

my $licenseBlock = q[/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * Derived from the Unicode Character Database by genUnicodePropertyData.pl
 *
 * For Unicode terms of use, see http://www.unicode.org/terms_of_use.html
 */
];

my $versionInfo = join("\n", @versionInfo);

print {$dataTablesFh} <<__END;
$licenseBlock
/*
 * Created on $timestamp from UCD data files with version info:
 *

$versionInfo

 *
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */

#include <stdint.h>
#include "harfbuzz/hb.h"

__END

open my $headerFh, '>', "UnicodeScriptCodes.h"
    or die "unable to open UnicodeScriptCodes.h for output: $!";

print {$headerFh} <<__END;
$licenseBlock
/*
 * Created on $timestamp from UCD data files with version info:
 *

$versionInfo

 *
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */

#ifndef intl_components_UnicodeScriptCodes_h_
#define intl_components_UnicodeScriptCodes_h_

__END

# Running total of the byte sizes reported by genTables.
our $totalData = 0;

# sprintHanVariants($baseUsv)
#
# Packs the 2-bit @hanVariant values of four consecutive code points starting
# at $baseUsv into one byte (lowest code point in the lowest bits) and returns
# it as "0xNN," text.
sub sprintHanVariants
{
    my $baseUsv = shift;
    my $val = 0;
    for (my $varShift = 0; $varShift < 8; $varShift += 2) {
        $val |= $hanVariant[$baseUsv++] << $varShift;
    }
    return sprintf("0x%02x,", $val);
}
## Han Variant data currently unused but may be needed in future, see bug 857481
## &genTables("HanVariant", "", "uint8_t", 9, 7, \&sprintHanVariants, 2, 1, 4);

# sprintFullWidth($usv): the @fullWidth entry for $usv as "0xNNNN," text.
sub sprintFullWidth
{
    my $usv = shift;
    return sprintf("0x%04x,", $fullWidth[$usv]);
}
genTables("FullWidth", "", "uint16_t", 10, 6, \&sprintFullWidth, 0, 2, 1);

# sprintFullWidthInverse($usv): the @fullWidthInverse entry for $usv as
# "0xNNNN," text.
sub sprintFullWidthInverse
{
    my $usv = shift;
    return sprintf("0x%04x,", $fullWidthInverse[$usv]);
}
genTables("FullWidthInverse", "", "uint16_t", 10, 6, \&sprintFullWidthInverse, 0, 2, 1);

print STDERR "Total data = $totalData\n";

# genTables($prefix, $typedef, $type, $indexBits, $charBits, $func,
#           $maxPlane, $bytesPerEntry, $charsPerEntry)
#
# Writes a de-duplicated multi-level lookup table to the data-tables file:
#   s<prefix>Planes  (only when $maxPlane > 0) page-map index for planes 1..N
#   s<prefix>Pages   per page-map, the index of the value page for each page
#   s<prefix>Values  the distinct value pages
# together with the k<prefix>MaxPlane / IndexBits / CharBits #defines.
#
#   $prefix         name fragment used in every emitted identifier
#   $typedef        text printed to the header file first, if non-empty
#   $type           C element type of the Values array
#   $indexBits      a plane is split into 2**$indexBits pages
#   $charBits       each page covers 2**$charBits code points
#   $func           code ref; called with a code point, returns the text of
#                   one array entry including its trailing comma
#   $maxPlane       highest plane to cover (0 = plane 0 only)
#   $bytesPerEntry  size of one Values entry, used only for the size report
#   $charsPerEntry  number of code points that $func folds into one entry
#
# Adds the computed table size to $totalData and reports it on STDERR.
sub genTables
{
    my ($prefix, $typedef, $type, $indexBits, $charBits, $func, $maxPlane, $bytesPerEntry, $charsPerEntry) = @_;

    if ($typedef ne '') {
        print {$headerFh} "$typedef\n";
    }

    print {$dataTablesFh} "#define k${prefix}MaxPlane $maxPlane\n";
    print {$dataTablesFh} "#define k${prefix}IndexBits $indexBits\n";
    print {$dataTablesFh} "#define k${prefix}CharBits $charBits\n";

    my $indexLen = 1 << $indexBits;
    my $charsPerPage = 1 << $charBits;
    my %charIndex;       # value-page text -> index in @char
    my %pageMapIndex;    # packed page map -> index in @pageMap
    my @pageMap;
    my @char;

    # One byte per plane above plane 0.
    my $planeMap = "\x00" x $maxPlane;
    foreach my $plane (0 .. $maxPlane) {
        # Two bytes (one packed 'S') per page. The parentheses matter: 'x'
        # and '*' share a precedence level and associate to the left.
        my $pageMap = "\x00" x ($indexLen * 2);
        foreach my $page (0 .. $indexLen - 1) {
            my $charValues = "";
            for (my $ch = 0; $ch < $charsPerPage; $ch += $charsPerEntry) {
                my $usv = $plane * 0x10000 + $page * $charsPerPage + $ch;
                $charValues .= $func->($usv);
            }
            chop $charValues;    # drop the trailing comma

            # Identical pages are stored once and shared by index.
            unless (exists $charIndex{$charValues}) {
                $charIndex{$charValues} = scalar keys %charIndex;
                $char[$charIndex{$charValues}] = $charValues;
            }
            substr($pageMap, $page * 2, 2) = pack('S', $charIndex{$charValues});
        }

        unless (exists $pageMapIndex{$pageMap}) {
            $pageMapIndex{$pageMap} = scalar keys %pageMapIndex;
            $pageMap[$pageMapIndex{$pageMap}] = $pageMap;
        }
        if ($plane > 0) {
            substr($planeMap, $plane - 1, 1) = pack('C', $pageMapIndex{$pageMap});
        }
    }

    if ($maxPlane) {
        print {$dataTablesFh} "static const uint8_t s${prefix}Planes[$maxPlane] = {";
        print {$dataTablesFh} join(',', map { sprintf("%d", $_) } unpack('C*', $planeMap));
        print {$dataTablesFh} "};\n\n";
    }

    my $chCount = scalar @char;
    my $pmBits = $chCount > 255 ? 16 : 8;
    my $pmCount = scalar @pageMap;
    if ($maxPlane == 0) {
        die "there should only be one pageMap entry!" if $pmCount > 1;
        print {$dataTablesFh} "static const uint${pmBits}_t s${prefix}Pages[$indexLen] = {\n";
    } else {
        print {$dataTablesFh} "static const uint${pmBits}_t s${prefix}Pages[$pmCount][$indexLen] = {\n";
    }
    # With several planes each page map is a braced row; with plane 0 only,
    # the single page map is written as a flat list.
    foreach my $i (0 .. $#pageMap) {
        my $rowOpen = $maxPlane > 0 ? " {" : " ";
        my $rowClose = $maxPlane > 0 ? ($i < $#pageMap ? "},\n" : "}\n") : "\n";
        print {$dataTablesFh} $rowOpen;
        print {$dataTablesFh} join(',', map { sprintf("%d", $_) } unpack('S*', $pageMap[$i]));
        print {$dataTablesFh} $rowClose;
    }
    print {$dataTablesFh} "};\n\n";

    my $pageLen = $charsPerPage / $charsPerEntry;
    print {$dataTablesFh} "static const $type s${prefix}Values[$chCount][$pageLen] = {\n";
    foreach my $i (0 .. $#char) {
        print {$dataTablesFh} " {";
        print {$dataTablesFh} $char[$i];
        print {$dataTablesFh} ($i < $#char ? "},\n" : "}\n");
    }
    print {$dataTablesFh} "};\n";

    my $dataSize = $pmCount * $indexLen * $pmBits/8 +
                   $chCount * $pageLen * $bytesPerEntry +
                   $maxPlane;
    $totalData += $dataSize;

    print STDERR "Data for $prefix = $dataSize\n";
}

print {$dataTablesFh} <<__END;
/*
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */
__END

# Buffered write errors only surface at close, so check it.
close $dataTablesFh or die "error writing nsUnicodePropertyData.cpp: $!";

# Emit the Script enum: one enumerator per ICU script code read earlier.
print {$headerFh} "namespace mozilla::intl {\n";
print {$headerFh} "enum class Script : int16_t {\n";
foreach my $i (0 .. $#scriptCodeToName) {
    print {$headerFh} " ", $scriptCodeToName[$i], " = ", $i, ",\n";
}
print {$headerFh} "\n NUM_SCRIPT_CODES = ", scalar @scriptCodeToName, ",\n";
print {$headerFh} "\n INVALID = -1\n";
print {$headerFh} "};\n";
print {$headerFh} "} // namespace mozilla::intl\n\n";

print {$headerFh} <<__END;
#endif
/*
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 */
__END

close $headerFh or die "error writing UnicodeScriptCodes.h: $!";