cldr-quotes.pl (3898B)
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Tool to generate the cldr-quotes.inc file, to be #include'd in Quotes.cpp
# to provide locale-appropriate opening and closing quote marks.

# To regenerate cldr-quotes.inc for a new CLDR release, download the data file
# "cldr-common-##.zip" from http://unicode.org/Public/cldr/latest into the
# current directory, run
#
#   perl cldr-quotes.pl <filename> > cldr-quotes.inc
#
# (where <filename> is the downloaded cldr-common-## archive), and
# then use `hg diff` to check that the result looks sane.

use warnings;
use strict;

use Encode;
use IO::Uncompress::Unzip "unzip";

die "Usage: perl cldr-quotes.pl <filename>" unless $#ARGV == 0;

my $filename = $ARGV[0];

# %langQuotes: locale code => [ quotationStart, quotationEnd,
#                               alternateQuotationStart, alternateQuotationEnd ],
#              holding the raw text captured from that locale's XML file
#              ("" where the file does not specify a value).
# %quoteLangs: C source text for one set of four quote code points
#              => list of the locale codes that use that set.
my (%langQuotes, %quoteLangs);

my $zip = IO::Uncompress::Unzip->new($filename) ||
  die "unzip failed: $IO::Uncompress::Unzip::UnzipError\n";

# Walk every member of the archive; only common/main/<locale>.xml is of interest
my $status = 1;
while ($status > 0) {
  my $name = $zip->getHeaderInfo()->{Name};
  if ($name =~ m@common/main/([A-Za-z0-9_]+)\.xml@) {
    my $lang = $1;
    # Convert *every* underscore: the parent lookup further down splits the
    # code on "-", so a leftover "_" would hide an intermediate ancestor
    # (e.g. "sr-Cyrl" for "sr_Cyrl_BA").
    $lang =~ tr/_/-/;

    # Start with all four values empty; "" marks "not specified here"
    $langQuotes{$lang} = [ ("") x 4 ];

    while (<$zip>) {
      $langQuotes{$lang}[0] = $1 if (m!<quotationStart>(.+)<!);
      $langQuotes{$lang}[1] = $1 if (m!<quotationEnd>(.+)<!);
      $langQuotes{$lang}[2] = $1 if (m!<alternateQuotationStart>(.+)<!);
      $langQuotes{$lang}[3] = $1 if (m!<alternateQuotationEnd>(.+)<!);
    }
  }
  $status = $zip->nextStream();
}

# nextStream() reports an error with a negative value; without this check a
# damaged archive would silently produce a truncated table.
die "unzip failed: $IO::Uncompress::Unzip::UnzipError\n" if $status < 0;
$zip->close;

# Every locale ultimately falls back to root, so its absence means the wrong
# (or an incomplete) archive was supplied.
die "no root locale data (common/main/root.xml) found in $filename\n"
  unless exists $langQuotes{"root"};

# Sorted order guarantees that a locale's ancestors (which are prefixes of its
# code) have already been completed by the time the locale itself is handled.
foreach my $lang (sort keys %langQuotes) {
  # We don't actually want to emit anything for the root locale
  next if $lang eq "root";

  # Inherit any missing entries from the locale's parent
  my $parent = $lang;
  while ($parent =~ m/\-/) {
    # Strip off a trailing subtag to find a parent locale code
    $parent =~ s/\-[^-]+$//;
    # Fill in any values available from the parent; an ancestor without a
    # data file is skipped so that it is not autovivified as an empty entry
    if (exists $langQuotes{$parent}) {
      for my $i (0 .. 3) {
        $langQuotes{$lang}[$i] = $langQuotes{$parent}[$i] unless $langQuotes{$lang}[$i];
      }
    }
  }

  # Anything still missing is copied from the root locale
  for my $i (0 .. 3) {
    $langQuotes{$lang}[$i] = $langQuotes{"root"}[$i] unless $langQuotes{$lang}[$i];
  }

  # If the locale ends up the same as its parent, skip
  # ($parent is by now the outermost ancestor, i.e. the bare language code)
  next if ($parent ne $lang) && (exists $langQuotes{$parent}) &&
      (join(",", @{$langQuotes{$lang}}) eq join(",", @{$langQuotes{$parent}}));

  # Create a string with the C source form for the array of 4 quote characters
  # (only the first code point of each captured value is used)
  my $quoteChars = join(", ", map { sprintf("0x%x", ord Encode::decode("UTF-8", $_)) } @{$langQuotes{$lang}});

  # Record this locale in the list of those which use this particular set of quotes
  push @{$quoteLangs{$quoteChars}}, $lang;
}

# Output each unique list of quotes, with the string of associated locales
my $timestamp = gmtime();
print <<__EOT__;
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * Derived from the Unicode Common Locale Data Repository by cldr-quotes.pl.
 *
 * For terms of use, see http://www.unicode.org/copyright.html.
 */

/*
 * Created on $timestamp from CLDR data file $filename.
 *
 * * * * * This file contains MACHINE-GENERATED DATA, do not edit! * * * * *
 *
 * (generated by intl/locale/cldr-quotes.pl)
 */

__EOT__

# One table row per distinct set of quotes: the NUL-separated, sorted locale
# list followed by the four code points. Rows are sorted so that the output is
# stable from run to run.
my @rows = map {
  sprintf(" { \"%s\\0\", { { %s } } },\n", join("\\0", sort @{$quoteLangs{$_}}), $_)
} keys %quoteLangs;

print "static const LangQuotesRec sLangQuotes[] = {\n";
print " // clang-format off\n";
print sort @rows;
print " // clang-format on\n";
print "};\n";