anzx4051.pl (9024B)
#!/usr/bin/perl
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

######################################################################
#
# anzx4051.pl
#
# Inputs (all ';'-separated text files, paths relative to the current
# directory):
#   ../../unicharutil/tools/UnicodeData-Latest.txt
#       field 0 = code point in hex, field 2 = general category
#   jisx4051simp.txt
#       field 0 = JIS X 4051 class, field 1 = simplified class
#   jisx4051class.txt
#       field 0 = first code point (hex), field 1 = last code point
#       (hex, may be empty for a single code point), field 2 = class
#
# Outputs:
#   anzx4051.html       two HTML tables: simplified class versus Unicode
#                       category, and simplified class versus 256-code-
#                       point row
#   ../jisx4051class.h  one gLBClassXX[32] array per selected row, eight
#                       class digits packed into each 32-bit word
#
######################################################################
use strict;
use warnings;

######################################################################
#
# File-scope state shared by the subroutines below
#
######################################################################
my %occ        = ();   # code point (number) -> "<simplified> | <class>"
my %gcat       = ();   # hex code point -> general category (e.g. "Lu")
my %dcat       = ();   # hex code point -> first letter of the category
my %simp       = ();   # class -> simplified class
my %gcount     = ();   # general category -> number of UnicodeData records
my %dcount     = ();   # category first letter -> number of records
my %sccount    = ();   # simplified class -> number of mapping records
my %rangecount = ();   # " <row hex>:<simplified>" -> code points claimed
my %gtotal     = ();   # <simplified><general category> -> code points
my %dtotal     = ();   # <simplified><category letter>  -> code points

my $out_fh;            # anzx4051.html
my $header_fh;         # ../jisx4051class.h

# Categories assumed for ranges that UnicodeData lists only by their
# first/last entry (or not at all).  Columns: low, high, value returned
# by GetClass, value returned by GetDClass.
my @fallback_ranges = (
    [ 0x3400, 0x9fa5, "Han", "Han" ],
    [ 0xac00, 0xd7a3, "Lo",  "L"   ],
    [ 0xd800, 0xdb7f, "Cs",  "C"   ],
    [ 0xdb80, 0xdbff, "Cs",  "C"   ],
    [ 0xdc00, 0xdfff, "Cs",  "C"   ],
    [ 0xe000, 0xf8ff, "Co",  "C"   ],
);

######################################################################
#
# Helpers
#
######################################################################

# Open $path with $mode and return the handle; die with $message plus
# the system error text on failure.
sub _open_or_die {
    my ($mode, $path, $message) = @_;
    open(my $fh, $mode, $path) or die "$message: $!";
    return $fh;
}

# Strip the line terminator (LF or CRLF) from $line and split it on ';'.
# Returns an empty list for a blank line.  Unlike chop, this never eats
# a data character when the last line of a file has no newline.
sub _fields {
    my ($line) = @_;
    $line =~ s/\r?\n\z//;
    return split(/;/, $line);
}

# Return $value, or the empty string when it is undefined, so that empty
# table cells can be printed without "uninitialized" warnings.
sub _blank_if_undef {
    my ($value) = @_;
    return defined $value ? $value : "";
}

# Format a number as upper-case hex, zero-padded to at least 4 digits.
sub DecToHex {
    my ($d) = @_;
    return sprintf("%04X", $d);
}

# Shared implementation of GetClass / GetDClass.
#   $u      code point (number)
#   $table  hash ref keyed by DecToHex($u)
#   $column index into a @fallback_ranges row (2 or 3)
#   $label  text inserted into the warning message
# Returns the table entry if it is non-empty, else the fallback for the
# range containing $u, else prints a warning on STDOUT and returns "".
sub _lookup_category {
    my ($u, $table, $column, $label) = @_;
    my $hex   = DecToHex($u);
    my $found = $table->{$hex};
    return $found if defined $found && $found ne "";

    for my $range (@fallback_ranges) {
        return $range->[$column]
            if $range->[0] <= $u && $u <= $range->[1];
    }

    printf "WARNING !!!! Cannot find %s for U+%s \n", $label, $hex;
    # Return an explicit empty category.  Previously the sub fell off the
    # end here and returned printf's status value, which the caller then
    # tallied as though it were a category name.
    return "";
}

# General category (e.g. "Lu") of code point $u, or "" if unknown.
sub GetClass {
    my ($u) = @_;
    return _lookup_category($u, \%gcat, 2, "General Category");
}

# First letter of the general category of code point $u (as stored in
# %dcat), or "" if unknown.
sub GetDClass {
    my ($u) = @_;
    return _lookup_category($u, \%dcat, 3, "Detailed General Category");
}

# Write both HTML tables to anzx4051.html.
sub printreport {
    my @letters    = sort(keys %dcount);
    my @categories = sort(keys %gcount);
    my @classes    = sort(keys %sccount);

    # Table 1: one row per simplified class; columns are the category
    # first letters, a total, then the full general categories.
    print {$out_fh} "<TABLE BORDER=3>\n";
    print {$out_fh} "<TR BGCOLOR=blue><TH><TH>\n";
    for my $letter (@letters) {
        print {$out_fh} "<TD BGCOLOR=red>$letter</TD>\n";
    }
    print {$out_fh} "<TD BGCOLOR=white>Total</TD>\n";
    for my $category (@categories) {
        print {$out_fh} "<TD BGCOLOR=yellow>$category</TD>\n";
    }
    print {$out_fh} "</TR>\n";

    for my $sc (@classes) {
        print {$out_fh} "<TR><TH>$sc<TH>\n";

        my $total = 0;
        for my $letter (@letters) {
            my $count = $dtotal{$sc . $letter};
            $total += $count if defined $count;
            print {$out_fh} "<TD>", _blank_if_undef($count), "</TD>\n";
        }
        print {$out_fh} "<TD BGCOLOR=white>$total</TD>\n";

        for my $category (@categories) {
            my $count = $gtotal{$sc . $category};
            print {$out_fh} "<TD>", _blank_if_undef($count), "</TD>\n";
        }
        print {$out_fh} "</TR>\n";
    }
    print {$out_fh} "</TABLE>\n";

    # Table 2: one row per 256-code-point row (high byte 00 .. 4E);
    # columns are the simplified classes.  Rows with no entries at all
    # are omitted.
    print {$out_fh} "<TABLE BORDER=3>\n";
    print {$out_fh} "<TR BGCOLOR=blue><TH><TH>\n";
    for my $sc (@classes) {
        print {$out_fh} "<TD BGCOLOR=red>$sc</TD>\n";
    }
    print {$out_fh} "</TR>\n";

    for (my $rr = 0; $rr < 0x4f; $rr++) {
        my $row       = sprintf("%02X", $rr);
        my $row_total = 0;
        my $html      = "<TR><TH>" . $row . "<TH>\n";

        for my $sc (@classes) {
            my $count = $rangecount{ " " . $row . ":" . $sc };
            $html .= sprintf("<TD>%s</TD>\n", _blank_if_undef($count));
            $row_total += $count if defined $count;
        }
        $html .= "</TR>\n";

        print {$out_fh} $html if $row_total != 0;
    }
    print {$out_fh} "</TABLE>\n";
    return;
}

# Emit gLBClass<$r>[32] to the C header for the 256 code points whose
# high byte is $r (two hex digits).  Each array element packs eight code
# points, highest code point first, one character per code point: the
# second character of the %occ entry when the code point was classified,
# otherwise $def.  Also echoes "[$r || $def]" on STDOUT.
sub printarray {
    my ($r, $def) = @_;
    printf "[%s || %s]\n", $r, $def;

    my $base = hex($r) * 256;
    printf {$header_fh} "static const uint32_t gLBClass%s[32] = {\n", $r;
    for (my $i = 0; $i < 256; $i += 8) {
        my $digits = "";
        for (my $j = 7; $j >= 0; $j--) {
            my $cp = $base + $i + $j;
            $digits .= exists($occ{$cp}) ? substr($occ{$cp}, 1, 1) : $def;
        }
        printf {$header_fh} "0x%s, // U+%04X - U+%04X\n",
            $digits, $base + $i, $base + $i + 7;
    }
    print {$header_fh} "};\n\n";
    return;
}

######################################################################
#
# Open the inputs first, so that a missing input does not truncate the
# existing output files
#
######################################################################
my $unicodata_fh = _open_or_die("<",
    "../../unicharutil/tools/UnicodeData-Latest.txt",
    "cannot find UnicodeData-Latest.txt");
my $class_fh = _open_or_die("<", "jisx4051class.txt",
    "cannot find jisx4051class.txt");
my $simp_fh = _open_or_die("<", "jisx4051simp.txt",
    "cannot find jisx4051simp.txt");

######################################################################
#
# Open the HTML report and the generated C header
#
######################################################################
$out_fh = _open_or_die(">", "anzx4051.html",
    "cannot open output anzx4051.html file");
$header_fh = _open_or_die(">", "../jisx4051class.h",
    "cannot open output ../jisx4051class.h file");

######################################################################
#
# Generate the license and header of the HTML report
#
# NOTE(review): the leading whitespace inside the two here-documents
# below was reconstructed from the standard MPL boilerplate; confirm it
# against the checked-in generated files.
#
######################################################################
my $htmlheader = <<'END_OF_HTML';
<!-- This Source Code Form is subject to the terms of the Mozilla Public
   - License, v. 2.0. If a copy of the MPL was not distributed with this
   - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->

<HTML>
<HEAD>
<TITLE>
Analysis of JIS X 4051 to Unicode General Category Mapping
</TITLE>
</HEAD>
<BODY>
<H1>
Analysis of JIS X 4051 to Unicode General Category Mapping
</H1>
END_OF_HTML
print {$out_fh} $htmlheader;

######################################################################
#
# Generate the license and "do not edit" banner of the C header
#
######################################################################
my $npl = <<'END_OF_NPL';
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
    DO NOT EDIT THIS DOCUMENT !!! THIS DOCUMENT IS GENERATED BY
    mozilla/intl/lwbrk/tools/anzx4051.pl
 */
END_OF_NPL
print {$header_fh} $npl;

######################################################################
#
# Read the Unicode database: remember each code point's category and
# count how many records carry each category
#
######################################################################
while (my $line = <$unicodata_fh>) {
    my @f = _fields($line);
    next unless @f;                       # blank line

    my $code     = $f[0];                 # the code point, in hex
    my $category = _blank_if_undef($f[2]);
    my $letter   = substr($category, 0, 1);

    $gcat{$code} = $category;
    $dcat{$code} = $letter;
    $gcount{$category}++;
    $dcount{$letter}++;
}
close($unicodata_fh);

######################################################################
#
# Read the class -> simplified class mapping
#
######################################################################
while (my $line = <$simp_fh>) {
    my @f = _fields($line);
    next unless @f;                       # blank line

    my $simplified = _blank_if_undef($f[1]);
    $simp{$f[0]} = $simplified;
    $sccount{$simplified}++;
}
close($simp_fh);

######################################################################
#
# Read the JIS X 4051 class ranges and tally every code point they claim
#
######################################################################
while (my $line = <$class_fh>) {
    my @f = _fields($line);
    next unless @f;                       # a blank line must not claim U+0000

    my $class = _blank_if_undef($f[2]);
    next if substr($class, 0, 1) eq "a";  # classes starting with "a" are ignored

    my $sc   = _blank_if_undef($simp{$class});
    my $low  = hex($f[0]);
    my $high = (defined $f[1] && $f[1] ne "") ? hex($f[1]) : $low;

    for my $cp ($low .. $high) {
        # The first definition of a code point wins; later ones are
        # silently ignored.  Diagnostic kept from the original, disabled:
        #   printf "WARNING !! Conflict defination!!! U+%s -> [%s] [%s | %s]\n",
        #          DecToHex($cp), $occ{$cp} , $class , $sc;
        next if exists($occ{$cp});

        $occ{$cp} = $sc . " | " . $class;
        $gtotal{ $sc . GetClass($cp) }++;
        $dtotal{ $sc . GetDClass($cp) }++;

        # Key by the real row number (code point / 256).  Taking the
        # first two hex digits of the formatted value would file code
        # points above U+FFFF under the wrong row.
        my $row = sprintf("%02X", $cp >> 8);
        $rangecount{ " " . $row . ":" . $sc }++;
    }
}
close($class_fh);

######################################################################
#
# Write the report and the per-row class arrays
#
######################################################################
printreport();

printarray("00", "7");
printarray("20", "7");
printarray("21", "7");
printarray("30", "5");
printarray("0E", "8");
printarray("17", "7");

######################################################################
#
# Close the output files; buffered write errors surface here
#
######################################################################
close($header_fh) or die "cannot close ../jisx4051class.h: $!";
close($out_fh)    or die "cannot close anzx4051.html: $!";