gennorm2.cpp (12174B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2009-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: gennorm2.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2009nov25 16 * created by: Markus W. Scherer 17 * 18 * This program reads text files that define Unicode normalization, 19 * parses them, and builds a binary data file. 20 */ 21 22 #include "unicode/utypes.h" 23 #include "n2builder.h" 24 25 #include <fstream> 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string> 29 #include <string.h> 30 #include "unicode/errorcode.h" 31 #include "unicode/localpointer.h" 32 #include "unicode/putil.h" 33 #include "unicode/uchar.h" 34 #include "unicode/unistr.h" 35 #include "charstr.h" 36 #include "normalizer2impl.h" 37 #include "toolutil.h" 38 #include "uoptions.h" 39 #include "uparse.h" 40 41 #if UCONFIG_NO_NORMALIZATION 42 #include "unewdata.h" 43 #endif 44 45 U_NAMESPACE_BEGIN 46 47 UBool beVerbose=false, haveCopyright=true; 48 49 #if !UCONFIG_NO_NORMALIZATION 50 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder); 51 #endif 52 53 /* -------------------------------------------------------------------------- */ 54 55 enum { 56 HELP_H, 57 HELP_QUESTION_MARK, 58 VERBOSE, 59 COPYRIGHT, 60 SOURCEDIR, 61 OUTPUT_FILENAME, 62 UNICODE_VERSION, 63 WRITE_C_SOURCE, 64 WRITE_COMBINED_DATA, 65 OPT_FAST 66 }; 67 68 static UOption options[]={ 69 UOPTION_HELP_H, 70 UOPTION_HELP_QUESTION_MARK, 71 UOPTION_VERBOSE, 72 UOPTION_COPYRIGHT, 73 UOPTION_SOURCEDIR, 74 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG), 75 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), 76 UOPTION_DEF("csource", '\1', UOPT_NO_ARG), 77 UOPTION_DEF("combined", '\1', UOPT_NO_ARG), 78 UOPTION_DEF("fast", '\1', UOPT_NO_ARG) 79 }; 80 81 U_NAMESPACE_END 82 83 int 84 main(int argc, char* argv[]) { 85 U_NAMESPACE_USE 86 U_MAIN_INIT_ARGS(argc, argv); 87 88 /* preset then read command line options */ 89 options[SOURCEDIR].value=""; 90 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options); 91 92 /* error handling, printing usage message */ 93 if(argc<0) { 94 fprintf(stderr, 95 "error in command line argument \"%s\"\n", 96 argv[-argc]); 97 } 98 if(!options[OUTPUT_FILENAME].doesOccur) { 99 argc=-1; 100 } 101 if( argc<2 || 102 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur 103 ) { 104 fprintf(stderr, 105 "Usage: %s [-options] infiles+ -o outputfilename\n" 106 "\n" 107 "Reads the infiles with normalization data and\n" 108 "creates a binary file, or a C source file (--csource), with the data,\n" 109 "or writes a data file with the combined data (--combined).\n" 110 "See https://unicode-org.github.io/icu/userguide/transforms/normalization#data-file-syntax\n" 111 "\n" 112 "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n" 113 "\n" 114 "Computes the difference of (a, b) minus (p, q) and writes the diff data\n" 115 "in input-file syntax to the outputfilename.\n" 116 "It is then possible to build (p, q, diff) to get the same data as (a, b).\n" 117 "(Useful for computing minimal incremental mapping data files.)\n" 118 "\n", 119 argv[0], argv[0]); 120 fprintf(stderr, 121 "Options:\n" 122 "\t-h or -? or --help this usage text\n" 123 "\t-v or --verbose verbose output\n" 124 "\t-c or --copyright include a copyright notice\n" 125 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n"); 126 fprintf(stderr, 127 "\t-s or --sourcedir source directory, followed by the path\n" 128 "\t-o or --output output filename\n" 129 "\t --csource writes a C source file with initializers\n" 130 "\t --combined writes a .txt file (input-file syntax) with the\n" 131 "\t combined data from all of the input files\n"); 132 fprintf(stderr, 133 "\t --fast optimize the data for fast normalization,\n" 134 "\t which might increase its size (Writes fully decomposed\n" 135 "\t regular mappings instead of delta mappings.\n" 136 "\t You should measure the runtime speed to make sure that\n" 137 "\t this is a good trade-off.)\n"); 138 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 139 } 140 141 beVerbose=options[VERBOSE].doesOccur; 142 haveCopyright=options[COPYRIGHT].doesOccur; 143 144 IcuToolErrorCode errorCode("gennorm2/main()"); 145 146 #if UCONFIG_NO_NORMALIZATION 147 148 fprintf(stderr, 149 "gennorm2 writes a dummy binary data file " 150 "because UCONFIG_NO_NORMALIZATION is set, \n" 151 "see icu/source/common/unicode/uconfig.h\n"); 152 udata_createDummy(nullptr, nullptr, options[OUTPUT_FILENAME].value, errorCode); 153 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. 154 // return U_UNSUPPORTED_ERROR; 155 return 0; 156 157 #else 158 159 LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode); 160 LocalPointer<Normalizer2DataBuilder> b2; 161 LocalPointer<Normalizer2DataBuilder> diff; 162 Normalizer2DataBuilder *builder = b1.getAlias(); 163 errorCode.assertSuccess(); 164 165 if(options[UNICODE_VERSION].doesOccur) { 166 builder->setUnicodeVersion(options[UNICODE_VERSION].value); 167 } 168 169 if(options[OPT_FAST].doesOccur) { 170 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); 171 } 172 173 // prepare the filename beginning with the source dir 174 CharString filename(options[SOURCEDIR].value, errorCode); 175 int32_t pathLength=filename.length(); 176 if( pathLength>0 && 177 filename[pathLength-1]!=U_FILE_SEP_CHAR && 178 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR 179 ) { 180 filename.append(U_FILE_SEP_CHAR, errorCode); 181 pathLength=filename.length(); 182 } 183 184 bool doMinus = false; 185 for(int i=1; i<argc; ++i) { 186 printf("gennorm2: processing %s\n", argv[i]); 187 if(strcmp(argv[i], "minus") == 0) { 188 if(doMinus) { 189 fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n"); 190 exit(U_ILLEGAL_ARGUMENT_ERROR); 191 } 192 // Data from previous input files has been collected in b1. 193 // Collect data from further input files in b2. 194 b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode); 195 diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode); 196 errorCode.assertSuccess(); 197 builder = b2.getAlias(); 198 if(options[UNICODE_VERSION].doesOccur) { 199 builder->setUnicodeVersion(options[UNICODE_VERSION].value); 200 } 201 if(options[OPT_FAST].doesOccur) { 202 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); 203 } 204 doMinus = true; 205 continue; 206 } 207 filename.append(argv[i], errorCode); 208 std::ifstream f(filename.data()); 209 if(f.fail()) { 210 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data()); 211 exit(U_FILE_ACCESS_ERROR); 212 } 213 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); 214 parseFile(f, *builder); 215 filename.truncate(pathLength); 216 } 217 218 if(doMinus) { 219 Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff); 220 diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true); 221 } else if(options[WRITE_COMBINED_DATA].doesOccur) { 222 builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false); 223 } else if(options[WRITE_C_SOURCE].doesOccur) { 224 builder->writeCSourceFile(options[OUTPUT_FILENAME].value); 225 } else { 226 builder->writeBinaryFile(options[OUTPUT_FILENAME].value); 227 } 228 229 return errorCode.get(); 230 231 #endif 232 } 233 234 U_NAMESPACE_BEGIN 235 236 #if !UCONFIG_NO_NORMALIZATION 237 238 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) { 239 IcuToolErrorCode errorCode("gennorm2/parseFile()"); 240 std::string lineString; 241 uint32_t startCP, endCP; 242 while(std::getline(f, lineString)) { 243 if (lineString.empty()) { 244 continue; // skip empty lines. 245 } 246 char *line = &lineString.front(); 247 char* comment = strchr(line, '#'); 248 if(comment!=nullptr) { 249 *comment=0; 250 } 251 u_rtrim(line); 252 if(line[0]==0) { 253 continue; // skip empty and comment-only lines 254 } 255 if(line[0]=='*') { 256 const char *s=u_skipWhitespace(line+1); 257 if(0==strncmp(s, "Unicode", 7)) { 258 s=u_skipWhitespace(s+7); 259 builder.setUnicodeVersion(s); 260 } 261 continue; // reserved syntax 262 } 263 const char *delimiter; 264 int32_t rangeLength= 265 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); 266 if(errorCode.isFailure()) { 267 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); 268 exit(errorCode.reset()); 269 } 270 if (endCP >= 0xd800 && startCP <= 0xdfff) { 271 fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n", 272 line); 273 exit(U_ILLEGAL_ARGUMENT_ERROR); 274 } 275 delimiter=u_skipWhitespace(delimiter); 276 if(*delimiter==':') { 277 const char *s=u_skipWhitespace(delimiter+1); 278 char *end; 279 unsigned long value=strtoul(s, &end, 10); 280 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { 281 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); 282 exit(U_PARSE_ERROR); 283 } 284 for (UChar32 c = static_cast<UChar32>(startCP); c <= static_cast<UChar32>(endCP); ++c) { 285 builder.setCC(c, static_cast<uint8_t>(value)); 286 } 287 continue; 288 } 289 if(*delimiter=='-') { 290 if(*u_skipWhitespace(delimiter+1)!=0) { 291 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); 292 exit(U_PARSE_ERROR); 293 } 294 for (UChar32 c = static_cast<UChar32>(startCP); c <= static_cast<UChar32>(endCP); ++c) { 295 builder.removeMapping(c); 296 } 297 continue; 298 } 299 if(*delimiter=='=' || *delimiter=='>') { 300 char16_t uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; 301 int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), nullptr, errorCode); 302 if(errorCode.isFailure()) { 303 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); 304 exit(errorCode.reset()); 305 } 306 UnicodeString mapping(false, uchars, length); 307 if(*delimiter=='=') { 308 if(rangeLength!=1) { 309 fprintf(stderr, 310 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", 311 line); 312 exit(U_PARSE_ERROR); 313 } 314 builder.setRoundTripMapping(static_cast<UChar32>(startCP), mapping); 315 } else { 316 for (UChar32 c = static_cast<UChar32>(startCP); c <= static_cast<UChar32>(endCP); ++c) { 317 builder.setOneWayMapping(c, mapping); 318 } 319 } 320 continue; 321 } 322 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); 323 exit(U_PARSE_ERROR); 324 } 325 } 326 327 #endif // !UCONFIG_NO_NORMALIZATION 328 329 U_NAMESPACE_END 330 331 /* 332 * Hey, Emacs, please set the following: 333 * 334 * Local Variables: 335 * indent-tabs-mode: nil 336 * End: 337 * 338 */