gensprep.c (14850B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2003-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: gensprep.c 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2003-02-06 16 * created by: Ram Viswanadha 17 * 18 * This program reads the Profile.txt files, 19 * parses them, and extracts the data for StringPrep profile. 20 * It then preprocesses it and writes a binary file for efficient use 21 * in various StringPrep conversion processes. 22 */ 23 24 #define USPREP_TYPE_NAMES_ARRAY 1 25 26 #include <stdbool.h> 27 #include <stdio.h> 28 #include <stdlib.h> 29 30 #include "cmemory.h" 31 #include "cstring.h" 32 #include "toolutil.h" 33 #include "unewdata.h" 34 #include "uoptions.h" 35 #include "uparse.h" 36 #include "sprpimpl.h" 37 38 #include "unicode/uclean.h" 39 #include "unicode/udata.h" 40 #include "unicode/utypes.h" 41 #include "unicode/putil.h" 42 43 44 U_CDECL_BEGIN 45 #include "gensprep.h" 46 U_CDECL_END 47 48 UBool beVerbose=false, haveCopyright=true; 49 50 #define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt" 51 52 #define NORMALIZE_DIRECTIVE "normalize" 53 #define NORMALIZE_DIRECTIVE_LEN 9 54 #define CHECK_BIDI_DIRECTIVE "check-bidi" 55 #define CHECK_BIDI_DIRECTIVE_LEN 10 56 57 /* prototypes --------------------------------------------------------------- */ 58 59 static void 60 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode); 61 62 static void 63 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode); 64 65 66 /* -------------------------------------------------------------------------- */ 67 68 static UOption options[]={ 69 UOPTION_HELP_H, 70 UOPTION_HELP_QUESTION_MARK, 71 UOPTION_VERBOSE, 72 UOPTION_COPYRIGHT, 73 UOPTION_DESTDIR, 74 UOPTION_SOURCEDIR, 75 UOPTION_ICUDATADIR, 76 UOPTION_BUNDLE_NAME, 77 { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 }, 78 { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 }, 79 { "check-bidi", NULL, NULL, NULL, 'k', UOPT_NO_ARG, 0}, 80 { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, 81 }; 82 83 enum{ 84 HELP, 85 HELP_QUESTION_MARK, 86 VERBOSE, 87 COPYRIGHT, 88 DESTDIR, 89 SOURCEDIR, 90 ICUDATADIR, 91 BUNDLE_NAME, 92 NORMALIZE, 93 NORM_CORRECTION_DIR, 94 CHECK_BIDI, 95 UNICODE_VERSION 96 }; 97 98 static int printHelp(int argc, char* argv[]){ 99 /* 100 * Broken into chucks because the C89 standard says the minimum 101 * required supported string length is 509 bytes. 102 */ 103 fprintf(stderr, 104 "Usage: %s [-options] [file_name]\n" 105 "\n" 106 "Read the files specified and\n" 107 "create a binary file [package-name]_[bundle-name]." DATA_TYPE " with the StringPrep profile data\n" 108 "\n", 109 argv[0]); 110 fprintf(stderr, 111 "Options:\n" 112 "\t-h or -? or --help print this usage text\n" 113 "\t-v or --verbose verbose output\n" 114 "\t-c or --copyright include a copyright notice\n"); 115 fprintf(stderr, 116 "\t-d or --destdir destination directory, followed by the path\n" 117 "\t-s or --sourcedir source directory of ICU data, followed by the path\n" 118 "\t-b or --bundle-name generate the output data file with the name specified\n" 119 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 120 "\t followed by path, defaults to %s\n", 121 u_getDataDirectory()); 122 fprintf(stderr, 123 "\t-n or --normalize turn on the option for normalization and include mappings\n" 124 "\t from NormalizationCorrections.txt from the given path,\n" 125 "\t e.g: /test/icu/source/data/unidata\n"); 126 fprintf(stderr, 127 "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n" 128 "\t when the input file contains a normalization directive.\n" 129 "\t unlike -n/--normalize, this option does not force the\n" 130 "\t normalization.\n"); 131 fprintf(stderr, 132 "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n" 133 "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n" 134 ); 135 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 136 } 137 138 139 extern int 140 main(int argc, char* argv[]) { 141 #if !UCONFIG_NO_IDNA 142 char* filename = NULL; 143 #endif 144 const char *srcDir=NULL, *destDir=NULL, *icuUniDataDir=NULL; 145 const char *bundleName=NULL, *inputFileName = NULL; 146 char *basename=NULL; 147 int32_t sprepOptions = 0; 148 149 UErrorCode errorCode=U_ZERO_ERROR; 150 151 U_MAIN_INIT_ARGS(argc, argv); 152 153 /* preset then read command line options */ 154 options[DESTDIR].value=u_getDataDirectory(); 155 options[SOURCEDIR].value=""; 156 options[UNICODE_VERSION].value="0"; /* don't assume the unicode version */ 157 options[BUNDLE_NAME].value = DATA_NAME; 158 options[NORMALIZE].value = ""; 159 160 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); 161 162 /* error handling, printing usage message */ 163 if(argc<0) { 164 fprintf(stderr, 165 "error in command line argument \"%s\"\n", 166 argv[-argc]); 167 } 168 if(argc<0 || options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { 169 return printHelp(argc, argv); 170 171 } 172 173 /* get the options values */ 174 beVerbose=options[VERBOSE].doesOccur; 175 haveCopyright=options[COPYRIGHT].doesOccur; 176 srcDir=options[SOURCEDIR].value; 177 destDir=options[DESTDIR].value; 178 bundleName = options[BUNDLE_NAME].value; 179 if(options[NORMALIZE].doesOccur) { 180 icuUniDataDir = options[NORMALIZE].value; 181 } else { 182 icuUniDataDir = options[NORM_CORRECTION_DIR].value; 183 } 184 185 if(argc<2) { 186 /* print the help message */ 187 return printHelp(argc, argv); 188 } else { 189 inputFileName = argv[1]; 190 } 191 if(!options[UNICODE_VERSION].doesOccur){ 192 return printHelp(argc, argv); 193 } 194 if(options[ICUDATADIR].doesOccur) { 195 u_setDataDirectory(options[ICUDATADIR].value); 196 } 197 #if UCONFIG_NO_IDNA 198 199 fprintf(stderr, 200 "gensprep writes dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE 201 " because UCONFIG_NO_IDNA is set, \n" 202 "see icu/source/common/unicode/uconfig.h\n"); 203 generateData(destDir, bundleName); 204 205 #else 206 207 setUnicodeVersion(options[UNICODE_VERSION].value); 208 filename = (char* ) uprv_malloc(uprv_strlen(srcDir) + uprv_strlen(inputFileName) + (icuUniDataDir == NULL ? 0 : uprv_strlen(icuUniDataDir)) + 40); /* hopefully this should be enough */ 209 210 /* prepare the filename beginning with the source dir */ 211 if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL && uprv_strchr(srcDir,U_FILE_ALT_SEP_CHAR) == NULL){ 212 filename[0] = '.'; 213 filename[1] = U_FILE_SEP_CHAR; 214 uprv_strcpy(filename+2,srcDir); 215 }else{ 216 uprv_strcpy(filename, srcDir); 217 } 218 219 basename=filename+uprv_strlen(filename); 220 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { 221 *basename++=U_FILE_SEP_CHAR; 222 } 223 224 /* initialize */ 225 init(); 226 227 /* process the file */ 228 uprv_strcpy(basename,inputFileName); 229 parseMappings(filename,false, &errorCode); 230 if(U_FAILURE(errorCode)) { 231 fprintf(stderr, "Could not open file %s for reading. Error: %s \n", filename, u_errorName(errorCode)); 232 return errorCode; 233 } 234 235 if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */ 236 /* set up directory for NormalizationCorrections.txt */ 237 uprv_strcpy(filename,icuUniDataDir); 238 basename=filename+uprv_strlen(filename); 239 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { 240 *basename++=U_FILE_SEP_CHAR; 241 } 242 243 *basename++=U_FILE_SEP_CHAR; 244 uprv_strcpy(basename,NORM_CORRECTIONS_FILE_NAME); 245 246 parseNormalizationCorrections(filename,&errorCode); 247 if(U_FAILURE(errorCode)){ 248 fprintf(stderr,"Could not open file %s for reading \n", filename); 249 return errorCode; 250 } 251 sprepOptions |= _SPREP_NORMALIZATION_ON; 252 } 253 254 if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */ 255 sprepOptions |= _SPREP_CHECK_BIDI_ON; 256 } 257 258 setOptions(sprepOptions); 259 260 /* process parsed data */ 261 if(U_SUCCESS(errorCode)) { 262 /* write the data file */ 263 generateData(destDir, bundleName); 264 265 cleanUpData(); 266 } 267 268 uprv_free(filename); 269 270 u_cleanup(); 271 272 #endif 273 274 return errorCode; 275 } 276 277 #if !UCONFIG_NO_IDNA 278 279 static void U_CALLCONV 280 normalizationCorrectionsLineFn(void *context, 281 char *fields[][2], int32_t fieldCount, 282 UErrorCode *pErrorCode) { 283 (void)context; // suppress compiler warnings about unused variable 284 (void)fieldCount; // suppress compiler warnings about unused variable 285 uint32_t mapping[40]; 286 char *end, *s; 287 uint32_t code; 288 int32_t length; 289 UVersionInfo version; 290 UVersionInfo thisVersion; 291 292 /* get the character code, field 0 */ 293 code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); 294 if(U_FAILURE(*pErrorCode)) { 295 fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]); 296 exit(*pErrorCode); 297 } 298 /* Original (erroneous) decomposition */ 299 s = fields[1][0]; 300 301 /* parse the mapping string */ 302 length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode); 303 304 /* ignore corrected decomposition */ 305 306 u_versionFromString(version,fields[3][0] ); 307 u_versionFromString(thisVersion, "3.2.0"); 308 309 310 311 if(U_FAILURE(*pErrorCode)) { 312 fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n", 313 (long)code, u_errorName(*pErrorCode)); 314 exit(*pErrorCode); 315 } 316 317 /* store the mapping */ 318 if( version[0] > thisVersion[0] || 319 ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1])) 320 ){ 321 storeMapping(code,mapping, length, USPREP_MAP, pErrorCode); 322 } 323 setUnicodeVersionNC(version); 324 } 325 326 static void 327 parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) { 328 char *fields[4][2]; 329 330 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 331 return; 332 } 333 334 u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode); 335 336 /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */ 337 338 if(U_FAILURE(*pErrorCode) && ( *pErrorCode!=U_FILE_ACCESS_ERROR)) { 339 fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); 340 exit(*pErrorCode); 341 } 342 } 343 344 static void U_CALLCONV 345 strprepProfileLineFn(void *context, 346 char *fields[][2], int32_t fieldCount, 347 UErrorCode *pErrorCode) { 348 (void)fieldCount; // suppress compiler warnings about unused variable 349 uint32_t mapping[40]; 350 char *end, *map; 351 uint32_t code; 352 int32_t length; 353 /*UBool* mapWithNorm = (UBool*) context;*/ 354 const char* typeName; 355 uint32_t rangeStart=0,rangeEnd =0; 356 const char* filename = (const char*) context; 357 const char *s; 358 359 s = u_skipWhitespace(fields[0][0]); 360 if (*s == '@') { 361 /* special directive */ 362 s++; 363 length = (int32_t)(fields[0][1] - s); 364 if (length >= NORMALIZE_DIRECTIVE_LEN 365 && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) { 366 options[NORMALIZE].doesOccur = true; 367 return; 368 } 369 else if (length >= CHECK_BIDI_DIRECTIVE_LEN 370 && uprv_strncmp(s, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) { 371 options[CHECK_BIDI].doesOccur = true; 372 return; 373 } 374 else { 375 fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]); 376 } 377 } 378 379 typeName = fields[2][0]; 380 map = fields[1][0]; 381 382 if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){ 383 384 u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); 385 if(U_FAILURE(*pErrorCode)){ 386 fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); 387 return; 388 } 389 390 /* store the range */ 391 storeRange(rangeStart,rangeEnd,USPREP_UNASSIGNED, pErrorCode); 392 393 }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){ 394 395 u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); 396 if(U_FAILURE(*pErrorCode)){ 397 fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); 398 return; 399 } 400 401 /* store the range */ 402 storeRange(rangeStart,rangeEnd,USPREP_PROHIBITED, pErrorCode); 403 404 }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){ 405 406 /* get the character code, field 0 */ 407 code=(uint32_t)uprv_strtoul(s, &end, 16); 408 if(end<=s || end!=fields[0][1]) { 409 fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]); 410 *pErrorCode=U_PARSE_ERROR; 411 exit(U_PARSE_ERROR); 412 } 413 414 /* parse the mapping string */ 415 length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode); 416 417 /* store the mapping */ 418 storeMapping(code,mapping, length,USPREP_MAP, pErrorCode); 419 420 }else{ 421 *pErrorCode = U_INVALID_FORMAT_ERROR; 422 } 423 424 if(U_FAILURE(*pErrorCode)) { 425 fprintf(stderr, "gensprep error parsing %s line %s at %s. Error: %s\n",filename, 426 fields[0][0],fields[2][0],u_errorName(*pErrorCode)); 427 exit(*pErrorCode); 428 } 429 430 } 431 432 static void 433 parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode) { 434 char *fields[3][2]; 435 436 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 437 return; 438 } 439 440 u_parseDelimitedFile(filename, ';', fields, 3, strprepProfileLineFn, (void*)filename, pErrorCode); 441 442 /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/ 443 444 if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) { 445 fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); 446 exit(*pErrorCode); 447 } 448 } 449 450 451 #endif /* #if !UCONFIG_NO_IDNA */ 452 453 /* 454 * Hey, Emacs, please set the following: 455 * 456 * Local Variables: 457 * indent-tabs-mode: nil 458 * End: 459 * 460 */