genSpecialCasingData.pl (7738B)
#!/usr/bin/env perl

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

# This tool is used to extract "special" (one-to-many) case mappings
# into a form that can be used by nsTextRunTransformations.
#
# Inputs (command line):
#   $ARGV[0]  UnicodeData.txt   - simple case mappings, general categories
#                                 and canonical decompositions
#   $ARGV[1]  SpecialCasing.txt - unconditional one-to-many case mappings
#
# Outputs:
#   standard output             - the C++ source for nsSpecialCasingData.cpp
#   all-{lower,upper,title}.html and matching -ref.html files, written to
#   the current directory.

use strict;
use warnings;

if (@ARGV != 2) {
  print <<__EOT;
# Run this tool using a command line of the form
#
#     perl genSpecialCasingData.pl UnicodeData.txt SpecialCasing.txt
#
# The nsSpecialCasingData.cpp file will be written to standard output.
#
# This tool will also write up-to-date versions of the test files
#     all-{upper,lower,title}.html
# and corresponding -ref files in the current directory.
#
__EOT
  exit 0;
}

# Simple (one-to-one) mappings from UnicodeData.txt, keyed by numeric code
# point; values are space-separated hex code point strings.  The special
# (one-to-many) mappings are merged into these later, for the test files.
my %allLower;
my %allUpper;
my %allTitle;
# Canonical decomposition string ("XXXX YYYY") -> precomposed hex code point.
my %compositions;
# Numeric code point -> General_Category value.
my %gc;

open my $unicodeData, '<', $ARGV[0]
  or die "can't open $ARGV[0] (should be UnicodeData.txt): $!\n";
while (my $line = <$unicodeData>) {
  chomp $line;
  # A negative limit keeps trailing empty fields, so the case-mapping
  # columns (12..14) are always defined for a well-formed record.
  my @fields = split /;/, $line, -1;
  next if @fields < 15;            # blank or malformed line
  next if ($fields[1] =~ /</);     # ignore ranges etc
  my $usv = hex $fields[0];
  $allUpper{$usv} = $fields[12] if $fields[12] ne '';
  $allLower{$usv} = $fields[13] if $fields[13] ne '';
  $allTitle{$usv} = $fields[14] if $fields[14] ne '';
  $gc{$usv} = $fields[2];
  # we only care about non-singleton canonical decomps
  my $decomp = $fields[5];
  next if $decomp eq '' or $decomp =~ /</ or not $decomp =~ / /;
  $compositions{$decomp} = sprintf("%04X", $usv);
}
close $unicodeData;

# One-to-many mappings from SpecialCasing.txt, keyed by numeric code point.
my %specialLower;
my %specialUpper;
my %specialTitle;
# Numeric code point -> trailing comment of its SpecialCasing.txt record.
my %charName;
# Version/date comment lines copied from the SpecialCasing.txt header.
my @headerLines;

open my $specialCasing, '<', $ARGV[1]
  or die "can't open $ARGV[1] (should be SpecialCasing.txt): $!\n";
while (my $line = <$specialCasing>) {
  chomp $line;
  # Take the comment from THIS line only; reading $1 after a failed match
  # would silently reuse the comment captured on an earlier line.
  my $comment = ($line =~ /#\s*(.+)$/) ? $1 : '';
  if ($comment =~ /^(SpecialCasing-|Date:)/) {
    push @headerLines, $comment;
    next;
  }
  $line =~ s/#.*//;
  $line =~ s/;\s*$//;
  next if $line eq '';
  my @fields = split /; */, $line;
  # Records with a condition list have a fifth field; only unconditional
  # mappings (exactly four fields) are used.
  next unless @fields == 4;
  my $usv = hex $fields[0];
  addIfSpecial(\%specialLower, $usv, $fields[1]);
  addIfSpecial(\%specialTitle, $usv, $fields[2]);
  addIfSpecial(\%specialUpper, $usv, $fields[3]);
  $charName{$usv} = $comment if $comment ne '';
}
close $specialCasing;

print <<__END__;
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

/* Auto-generated from files in the Unicode Character Database
   by genSpecialCasingData.pl - do not edit! */

#include "nsSpecialCasingData.h"
#include <stdlib.h>  // for bsearch
#include <array>     // for std::size

__END__
print "/* $_ */\n" foreach @headerLines;

print <<__END__;

using mozilla::unicode::MultiCharMapping;

__END__

printMappings('Lower', \%specialLower);
printMappings('Upper', \%specialUpper);
printMappings('Title', \%specialTitle);

print <<__END__;
static int CompareMCM(const void* aKey, const void* aElement)
{
  const uint32_t ch = *static_cast<const uint32_t*>(aKey);
  const MultiCharMapping* mcm = static_cast<const MultiCharMapping*>(aElement);
  return int(ch) - int(mcm->mOriginalChar);
}

#define MAKE_SPECIAL_CASE_ACCESSOR(which) \\
  const MultiCharMapping* \\
  Special##which(uint32_t aChar) \\
  { \\
    const void* p = bsearch(&aChar, CaseSpecials_##which, \\
                            std::size(CaseSpecials_##which), \\
                            sizeof(MultiCharMapping), CompareMCM); \\
    return static_cast<const MultiCharMapping*>(p); \\
  }

namespace mozilla {
namespace unicode {

MAKE_SPECIAL_CASE_ACCESSOR(Lower)
MAKE_SPECIAL_CASE_ACCESSOR(Upper)
MAKE_SPECIAL_CASE_ACCESSOR(Title)

} // namespace unicode
} // namespace mozilla
__END__

# For the test files, the special mappings override the simple ones.
addSpecialsTo(\%allLower, \%specialLower);
addSpecialsTo(\%allUpper, \%specialUpper);
addSpecialsTo(\%allTitle, \%specialTitle);

my $testFont = "../fonts/dejavu-sans/DejaVuSans.ttf";
genTest('lower', \%allLower);
genTest('upper', \%allUpper);
genTitleTest();

# printMappings($whichMapping, \%hash)
#
# Print to standard output the C++ table CaseSpecials_<whichMapping>, one
# entry per key of %hash in ascending code point order.  Each entry holds
# the original code point, three mapped code points (unused trailing slots
# are written as 0x0000), and the character's comment from %charName.
sub printMappings {
  my ($whichMapping, $hash) = @_;
  print "static const MultiCharMapping CaseSpecials_${whichMapping}[] = {\n";
  foreach my $key (sort { $a <=> $b } keys %$hash) {
    my @chars = split(/ /, $hash->{$key});
    # Pad to exactly three slots; a missing slot becomes 0.
    my @codes = map { defined $chars[$_] ? hex($chars[$_]) : 0 } 0 .. 2;
    my $name = exists $charName{$key} ? $charName{$key} : '';
    printf "  { 0x%04x, {0x%04x, 0x%04x, 0x%04x} }, // %s\n",
           $key, @codes, $name;
  }
  print "};\n\n";
}

# addIfSpecial(\%hash, $usv, $mapping)
#
# Record $mapping (space-separated hex code points) for code point $usv in
# %hash, but only if it is a one-to-many mapping, i.e. contains a space.
# A leading sequence that equals a known canonical decomposition is replaced
# by its precomposed character first.
sub addIfSpecial {
  my ($hash, $usv, $mapping) = @_;
  return unless $mapping =~ / /;
  # only do compositions that start with the initial char
  # Keys are visited in sorted order so the result does not depend on hash
  # ordering, and the match must end on a code point boundary so that a key
  # cannot match just the leading digits of a longer code point.
  foreach my $decomp (sort keys %compositions) {
    $mapping =~ s/^\Q$decomp\E(?![0-9A-Fa-f])/$compositions{$decomp}/;
  }
  $hash->{$usv} = $mapping;
}

# addSpecialsTo(\%hash, \%specials)
#
# Copy every entry of %specials into %hash, overwriting existing entries.
sub addSpecialsTo {
  my ($hash, $specials) = @_;
  foreach my $key (keys %$specials) {
    $hash->{$key} = $specials->{$key};
  }
}

# openOutput($path)
#
# Create/truncate $path for writing and return the filehandle; dies with the
# system error message if the file cannot be opened.
sub openOutput {
  my ($path) = @_;
  open my $fh, '>', $path or die "can't create $path: $!\n";
  return $fh;
}

# closeOutput($fh, $path)
#
# Close a handle returned by openOutput; dies if the close fails, since
# buffered write errors are only reported at that point.
sub closeOutput {
  my ($fh, $path) = @_;
  close $fh or die "error writing $path: $!\n";
  return;
}

# htmlPrologue($pRules)
#
# Return the opening part of a generated test page, up to and including the
# <p> tag.  $pRules is the CSS declaration list placed in the "p" rule.
sub htmlPrologue {
  my ($pRules) = @_;
  return <<__END__;
<!DOCTYPE html>
<!-- GENERATED FILE, DO NOT EDIT -->
<html>
 <head>
  <meta http-equiv="Content-type" content="text/html; charset=utf-8">
  <style type="text/css">
   \@font-face { font-family: foo; src: url($testFont); }
   p { $pRules }
  </style>
 </head>
 <body>
  <p>
__END__
}

# htmlEpilogue()
#
# Return the closing part of a generated test page.
sub htmlEpilogue {
  return <<__END__;
  </p>
 </body>
</html>
__END__
}

# charComment($key)
#
# Return " <!-- name -->" for code point $key if %charName has an entry for
# it, otherwise the empty string.
sub charComment {
  my ($key) = @_;
  return exists $charName{$key} ? " <!-- $charName{$key} -->" : '';
}

# entities($mapping)
#
# Turn a space-separated list of hex code points into a string of HTML
# numeric character references.
sub entities {
  my ($mapping) = @_;
  return join('', map { sprintf("&#x%s;", $_) } split(/ /, $mapping));
}

# genTest($whichMapping, \%hash)
#
# Write all-<whichMapping>.html, which lists every key of %hash with
# "text-transform: <whichMapping>case" applied, and
# all-<whichMapping>-ref.html, which lists the expected mapped characters
# with no transform.  $whichMapping is 'lower' or 'upper'.
sub genTest {
  my ($whichMapping, $hash) = @_;
  my @keys = sort { $a <=> $b } keys %$hash;

  my $testPath = "all-$whichMapping.html";
  my $out = openOutput($testPath);
  print {$out} htmlPrologue(
    "font-family: foo; font-size: 12px; text-transform: ${whichMapping}case;");
  foreach my $key (@keys) {
    printf {$out} "&#x%04X;", $key;
    print {$out} charComment($key), "\n";
  }
  print {$out} htmlEpilogue();
  closeOutput($out, $testPath);

  my $refPath = "all-$whichMapping-ref.html";
  $out = openOutput($refPath);
  print {$out} htmlPrologue("font-family: foo; font-size: 12px;");
  foreach my $key (@keys) {
    print {$out} entities($hash->{$key});
    print {$out} charComment($key), "\n";
  }
  print {$out} htmlEpilogue();
  closeOutput($out, $refPath);
}

# genTitleTest()
#
# Write all-title.html, which lists every key of %allTitle followed by a
# lowercase "x" with "text-transform: capitalize" applied, and
# all-title-ref.html with the expected rendering.
sub genTitleTest {
  my @keys = sort { $a <=> $b } keys %allTitle;

  my $testPath = "all-title.html";
  my $out = openOutput($testPath);
  print {$out} htmlPrologue("font-family: foo; text-transform: capitalize;");
  foreach my $key (@keys) {
    printf {$out} "&#x%04X;x", $key;
    print {$out} charComment($key), "\n";
  }
  print {$out} htmlEpilogue();
  closeOutput($out, $testPath);

  my $refPath = "all-title-ref.html";
  $out = openOutput($refPath);
  print {$out} htmlPrologue("font-family: foo;");
  foreach my $key (@keys) {
    my $category = exists $gc{$key} ? $gc{$key} : '';
    # capitalize is only applied to characters with GC=L* or N*...
    if ($category =~ /^[LN]/) {
      # ...and those that are already uppercase are not transformed
      if (exists $allUpper{$key}) {
        print {$out} entities($allTitle{$key});
      } else {
        printf {$out} "&#x%04X;", $key;
      }
      print {$out} "x";
    } else {
      # The character itself is not capitalized, so the transform falls
      # through to the following "x" instead.
      printf {$out} "&#x%04X;X", $key;
    }
    print {$out} charComment($key), "\n";
  }
  print {$out} htmlEpilogue();
  closeOutput($out, $refPath);
}