makeconv.cpp (29821B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************** 5 * 6 * Copyright (C) 1998-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************** 10 * 11 * 12 * makeconv.cpp: 13 * tool creating a binary (compressed) representation of the conversion mapping 14 * table (IBM NLTC ucmap format). 15 * 16 * 05/04/2000 helena Added fallback mapping into the picture... 17 * 06/29/2000 helena Major rewrite of the callback APIs. 18 */ 19 20 #include <stdio.h> 21 #include "unicode/putil.h" 22 #include "unicode/ucnv_err.h" 23 #include "charstr.h" 24 #include "ucnv_bld.h" 25 #include "ucnv_imp.h" 26 #include "ucnv_cnv.h" 27 #include "cstring.h" 28 #include "cmemory.h" 29 #include "uinvchar.h" 30 #include "filestrm.h" 31 #include "toolutil.h" 32 #include "uoptions.h" 33 #include "unicode/udata.h" 34 #include "unewdata.h" 35 #include "uparse.h" 36 #include "ucm.h" 37 #include "makeconv.h" 38 #include "genmbcs.h" 39 40 #define DEBUG 0 41 42 typedef struct ConvData { 43 UCMFile *ucm; 44 NewConverter *cnvData, *extData; 45 UConverterSharedData sharedData; 46 UConverterStaticData staticData; 47 } ConvData; 48 49 static void 50 initConvData(ConvData *data) { 51 uprv_memset(data, 0, sizeof(ConvData)); 52 data->sharedData.structSize=sizeof(UConverterSharedData); 53 data->staticData.structSize=sizeof(UConverterStaticData); 54 data->sharedData.staticData=&data->staticData; 55 } 56 57 static void 58 cleanupConvData(ConvData *data) { 59 if(data!=nullptr) { 60 if(data->cnvData!=nullptr) { 61 data->cnvData->close(data->cnvData); 62 data->cnvData=nullptr; 63 } 64 if(data->extData!=nullptr) { 65 data->extData->close(data->extData); 66 data->extData=nullptr; 67 } 68 ucm_close(data->ucm); 69 data->ucm=nullptr; 70 } 71 } 72 73 /* 74 * from ucnvstat.c - static prototypes of data-based converters 75 */ 76 U_CAPI const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]; 77 78 /* 79 * Global - verbosity 80 */ 81 UBool VERBOSE = false; 82 UBool QUIET = false; 83 UBool SMALL = false; 84 UBool IGNORE_SISO_CHECK = false; 85 86 static void 87 createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode); 88 89 /* 90 * Set up the UNewData and write the converter.. 91 */ 92 static void 93 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status); 94 95 UBool haveCopyright=true; 96 97 static UDataInfo dataInfo={ 98 sizeof(UDataInfo), 99 0, 100 101 U_IS_BIG_ENDIAN, 102 U_CHARSET_FAMILY, 103 sizeof(char16_t), 104 0, 105 106 {0x63, 0x6e, 0x76, 0x74}, /* dataFormat="cnvt" */ 107 {6, 2, 0, 0}, /* formatVersion */ 108 {0, 0, 0, 0} /* dataVersion (calculated at runtime) */ 109 }; 110 111 static void 112 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status) 113 { 114 UNewDataMemory *mem = nullptr; 115 uint32_t sz2; 116 uint32_t size = 0; 117 int32_t tableType; 118 119 if(U_FAILURE(*status)) 120 { 121 return; 122 } 123 124 tableType=TABLE_NONE; 125 if(data->cnvData!=nullptr) { 126 tableType|=TABLE_BASE; 127 } 128 if(data->extData!=nullptr) { 129 tableType|=TABLE_EXT; 130 } 131 132 mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : nullptr, status); 133 134 if(U_FAILURE(*status)) 135 { 136 fprintf(stderr, "Couldn't create the udata %s.%s: %s\n", 137 cnvName, 138 "cnv", 139 u_errorName(*status)); 140 return; 141 } 142 143 if(VERBOSE) 144 { 145 printf("- Opened udata %s.%s\n", cnvName, "cnv"); 146 } 147 148 149 /* all read only, clean, platform independent data. Mmmm. :) */ 150 udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData)); 151 size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */ 152 /* Now, write the table */ 153 if(tableType&TABLE_BASE) { 154 size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType); 155 } 156 if(tableType&TABLE_EXT) { 157 size += data->extData->write(data->extData, &data->staticData, mem, tableType); 158 } 159 160 sz2 = udata_finish(mem, status); 161 if(size != sz2) 162 { 163 fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", static_cast<int>(sz2), static_cast<int>(size)); 164 *status=U_INTERNAL_PROGRAM_ERROR; 165 } 166 if(VERBOSE) 167 { 168 printf("- Wrote %u bytes to the udata.\n", static_cast<int>(sz2)); 169 } 170 } 171 172 enum { 173 OPT_HELP_H, 174 OPT_HELP_QUESTION_MARK, 175 OPT_COPYRIGHT, 176 OPT_VERSION, 177 OPT_DESTDIR, 178 OPT_VERBOSE, 179 OPT_SMALL, 180 OPT_IGNORE_SISO_CHECK, 181 OPT_QUIET, 182 OPT_SOURCEDIR, 183 184 OPT_COUNT 185 }; 186 187 static UOption options[]={ 188 UOPTION_HELP_H, 189 UOPTION_HELP_QUESTION_MARK, 190 UOPTION_COPYRIGHT, 191 UOPTION_VERSION, 192 UOPTION_DESTDIR, 193 UOPTION_VERBOSE, 194 { "small", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0 }, 195 { "ignore-siso-check", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0 }, 196 UOPTION_QUIET, 197 UOPTION_SOURCEDIR, 198 }; 199 200 int main(int argc, char* argv[]) 201 { 202 ConvData data; 203 char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH]; 204 205 U_MAIN_INIT_ARGS(argc, argv); 206 207 /* Set up the ICU version number */ 208 UVersionInfo icuVersion; 209 u_getVersion(icuVersion); 210 uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo)); 211 212 /* preset then read command line options */ 213 options[OPT_DESTDIR].value=u_getDataDirectory(); 214 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); 215 216 if(options[OPT_VERSION].doesOccur) { 217 printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n", 218 dataInfo.formatVersion[0], dataInfo.formatVersion[1]); 219 printf("%s\n", U_COPYRIGHT_STRING); 220 exit(0); 221 } 222 223 /* error handling, printing usage message */ 224 if(argc<0) { 225 fprintf(stderr, 226 "error in command line argument \"%s\"\n", 227 argv[-argc]); 228 } else if(argc<2) { 229 argc=-1; 230 } 231 if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) { 232 FILE *stdfile=argc<0 ? stderr : stdout; 233 fprintf(stdfile, 234 "usage: %s [-options] files...\n" 235 "\tread .ucm codepage mapping files and write .cnv files\n" 236 "options:\n" 237 "\t-h or -? or --help this usage text\n" 238 "\t-V or --version show a version message\n" 239 "\t-c or --copyright include a copyright notice\n" 240 "\t-d or --destdir destination directory, followed by the path\n" 241 "\t-v or --verbose Turn on verbose output\n" 242 "\t-q or --quiet do not display warnings and progress\n" 243 "\t-s or --sourcedir source directory, followed by the path\n", 244 argv[0]); 245 fprintf(stdfile, 246 "\t --small Generate smaller .cnv files. They will be\n" 247 "\t significantly smaller but may not be compatible with\n" 248 "\t older versions of ICU and will require heap memory\n" 249 "\t allocation when loaded.\n" 250 "\t --ignore-siso-check Use SI/SO other than 0xf/0xe.\n"); 251 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 252 } 253 254 /* get the options values */ 255 haveCopyright = options[OPT_COPYRIGHT].doesOccur; 256 const char *destdir = options[OPT_DESTDIR].value; 257 VERBOSE = options[OPT_VERBOSE].doesOccur; 258 QUIET = options[OPT_QUIET].doesOccur; 259 SMALL = options[OPT_SMALL].doesOccur; 260 261 if (options[OPT_IGNORE_SISO_CHECK].doesOccur) { 262 IGNORE_SISO_CHECK = true; 263 } 264 265 icu::CharString outFileName; 266 UErrorCode err = U_ZERO_ERROR; 267 if (destdir != nullptr && *destdir != 0) { 268 outFileName.append(destdir, err).ensureEndsWithFileSeparator(err); 269 if (U_FAILURE(err)) { 270 return err; 271 } 272 } 273 int32_t outBasenameStart = outFileName.length(); 274 275 #if DEBUG 276 { 277 int i; 278 printf("makeconv: processing %d files...\n", argc - 1); 279 for(i=1; i<argc; ++i) { 280 printf("%s ", argv[i]); 281 } 282 printf("\n"); 283 fflush(stdout); 284 } 285 #endif 286 287 UBool printFilename = static_cast<UBool>(argc > 2 || VERBOSE); 288 icu::CharString pathBuf; 289 for (++argv; --argc; ++argv) 290 { 291 UErrorCode localError = U_ZERO_ERROR; 292 const char *arg = getLongPathname(*argv); 293 294 const char* sourcedir = options[OPT_SOURCEDIR].value; 295 if (sourcedir != nullptr && *sourcedir != 0 && uprv_strcmp(sourcedir, ".") != 0) { 296 pathBuf.clear(); 297 pathBuf.appendPathPart(sourcedir, localError); 298 pathBuf.appendPathPart(arg, localError); 299 arg = pathBuf.data(); 300 } 301 302 /*produces the right destination path for display*/ 303 outFileName.truncate(outBasenameStart); 304 if (outBasenameStart != 0) 305 { 306 /* find the last file sepator */ 307 const char *basename = findBasename(arg); 308 outFileName.append(basename, localError); 309 } 310 else 311 { 312 outFileName.append(arg, localError); 313 } 314 if (U_FAILURE(localError)) { 315 return localError; 316 } 317 318 /*removes the extension if any is found*/ 319 int32_t lastDotIndex = outFileName.lastIndexOf('.'); 320 if (lastDotIndex >= outBasenameStart) { 321 outFileName.truncate(lastDotIndex); 322 } 323 324 /* the basename without extension is the converter name */ 325 if ((outFileName.length() - outBasenameStart) >= UPRV_LENGTHOF(cnvName)) { 326 fprintf(stderr, "converter name %s too long\n", outFileName.data() + outBasenameStart); 327 return U_BUFFER_OVERFLOW_ERROR; 328 } 329 uprv_strcpy(cnvName, outFileName.data() + outBasenameStart); 330 331 /*Adds the target extension*/ 332 outFileName.append(CONVERTER_FILE_EXTENSION, localError); 333 if (U_FAILURE(localError)) { 334 return localError; 335 } 336 337 #if DEBUG 338 printf("makeconv: processing %s ...\n", arg); 339 fflush(stdout); 340 #endif 341 initConvData(&data); 342 createConverter(&data, arg, &localError); 343 344 if (U_FAILURE(localError)) 345 { 346 /* if an error is found, print out an error msg and keep going */ 347 fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", 348 outFileName.data(), arg, u_errorName(localError)); 349 if(U_SUCCESS(err)) { 350 err = localError; 351 } 352 } 353 else 354 { 355 /* Insure the static data name matches the file name */ 356 /* Changed to ignore directory and only compare base name 357 LDH 1/2/08*/ 358 char *p; 359 p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */ 360 361 if(p == nullptr) /* OK, try alternate */ 362 { 363 p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR); 364 if(p == nullptr) 365 { 366 p=cnvName; /* If no separators, no problem */ 367 } 368 } 369 else 370 { 371 p++; /* If found separator, don't include it in compare */ 372 } 373 if(uprv_stricmp(p,data.staticData.name) && !QUIET) 374 { 375 fprintf(stderr, "Warning: %s%s claims to be '%s'\n", 376 cnvName, CONVERTER_FILE_EXTENSION, 377 data.staticData.name); 378 } 379 380 if (strlen(cnvName) + 1 > UPRV_LENGTHOF(data.staticData.name)) { 381 fprintf(stderr, "converter name %s too long\n", cnvName); 382 return U_BUFFER_OVERFLOW_ERROR; 383 } 384 uprv_strcpy((char*)data.staticData.name, cnvName); 385 386 if(!uprv_isInvariantString((char*)data.staticData.name, -1)) { 387 fprintf(stderr, 388 "Error: A converter name must contain only invariant characters.\n" 389 "%s is not a valid converter name.\n", 390 data.staticData.name); 391 if(U_SUCCESS(err)) { 392 err = U_INVALID_TABLE_FORMAT; 393 } 394 } 395 396 localError = U_ZERO_ERROR; 397 writeConverterData(&data, cnvName, destdir, &localError); 398 399 if(U_FAILURE(localError)) 400 { 401 /* if an error is found, print out an error msg and keep going*/ 402 fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName.data(), arg, 403 u_errorName(localError)); 404 if(U_SUCCESS(err)) { 405 err = localError; 406 } 407 } 408 else if (printFilename) 409 { 410 puts(outFileName.data() + outBasenameStart); 411 } 412 } 413 fflush(stdout); 414 fflush(stderr); 415 416 cleanupConvData(&data); 417 } 418 419 return err; 420 } 421 422 static void 423 getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) { 424 if( (name[0]=='i' || name[0]=='I') && 425 (name[1]=='b' || name[1]=='B') && 426 (name[2]=='m' || name[2]=='M') 427 ) { 428 name+=3; 429 if(*name=='-') { 430 ++name; 431 } 432 *pPlatform=UCNV_IBM; 433 *pCCSID = static_cast<int32_t>(uprv_strtoul(name, nullptr, 10)); 434 } else { 435 *pPlatform=UCNV_UNKNOWN; 436 *pCCSID=0; 437 } 438 } 439 440 static void 441 readHeader(ConvData *data, 442 FileStream* convFile, 443 UErrorCode *pErrorCode) { 444 char line[1024]; 445 char *s, *key, *value; 446 const UConverterStaticData *prototype; 447 UConverterStaticData *staticData; 448 449 if(U_FAILURE(*pErrorCode)) { 450 return; 451 } 452 453 staticData=&data->staticData; 454 staticData->platform=UCNV_IBM; 455 staticData->subCharLen=0; 456 457 while(T_FileStream_readLine(convFile, line, sizeof(line))) { 458 /* basic parsing and handling of state-related items */ 459 if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) { 460 continue; 461 } 462 463 /* stop at the beginning of the mapping section */ 464 if(uprv_strcmp(line, "CHARMAP")==0) { 465 break; 466 } 467 468 /* collect the information from the header field, ignore unknown keys */ 469 if(uprv_strcmp(key, "code_set_name")==0) { 470 if(*value!=0) { 471 uprv_strcpy((char *)staticData->name, value); 472 getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage); 473 } 474 } else if(uprv_strcmp(key, "subchar")==0) { 475 uint8_t bytes[UCNV_EXT_MAX_BYTES]; 476 int8_t length; 477 478 s=value; 479 length=ucm_parseBytes(bytes, line, (const char **)&s); 480 if(1<=length && length<=4 && *s==0) { 481 staticData->subCharLen=length; 482 uprv_memcpy(staticData->subChar, bytes, length); 483 } else { 484 fprintf(stderr, "error: illegal <subchar> %s\n", value); 485 *pErrorCode=U_INVALID_TABLE_FORMAT; 486 return; 487 } 488 } else if(uprv_strcmp(key, "subchar1")==0) { 489 uint8_t bytes[UCNV_EXT_MAX_BYTES]; 490 491 s=value; 492 if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) { 493 staticData->subChar1=bytes[0]; 494 } else { 495 fprintf(stderr, "error: illegal <subchar1> %s\n", value); 496 *pErrorCode=U_INVALID_TABLE_FORMAT; 497 return; 498 } 499 } 500 } 501 502 /* copy values from the UCMFile to the static data */ 503 staticData->maxBytesPerChar = static_cast<int8_t>(data->ucm->states.maxCharLength); 504 staticData->minBytesPerChar = static_cast<int8_t>(data->ucm->states.minCharLength); 505 staticData->conversionType=data->ucm->states.conversionType; 506 507 if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) { 508 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n"); 509 *pErrorCode=U_INVALID_TABLE_FORMAT; 510 return; 511 } 512 513 /* 514 * Now that we know the type, copy any 'default' values from the table. 515 * We need not check the type any further because the parser only 516 * recognizes what we have prototypes for. 517 * 518 * For delta (extension-only) tables, copy values from the base file 519 * instead, see createConverter(). 520 */ 521 if(data->ucm->baseName[0]==0) { 522 prototype=ucnv_converterStaticData[staticData->conversionType]; 523 if(prototype!=nullptr) { 524 if(staticData->name[0]==0) { 525 uprv_strcpy((char *)staticData->name, prototype->name); 526 } 527 528 if(staticData->codepage==0) { 529 staticData->codepage=prototype->codepage; 530 } 531 532 if(staticData->platform==0) { 533 staticData->platform=prototype->platform; 534 } 535 536 if(staticData->minBytesPerChar==0) { 537 staticData->minBytesPerChar=prototype->minBytesPerChar; 538 } 539 540 if(staticData->maxBytesPerChar==0) { 541 staticData->maxBytesPerChar=prototype->maxBytesPerChar; 542 } 543 544 if(staticData->subCharLen==0) { 545 staticData->subCharLen=prototype->subCharLen; 546 if(prototype->subCharLen>0) { 547 uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen); 548 } 549 } 550 } 551 } 552 553 if(data->ucm->states.outputType<0) { 554 data->ucm->states.outputType = static_cast<int8_t>(data->ucm->states.maxCharLength) - 1; 555 } 556 557 if( staticData->subChar1!=0 && 558 (staticData->minBytesPerChar>1 || 559 (staticData->conversionType!=UCNV_MBCS && 560 staticData->conversionType!=UCNV_EBCDIC_STATEFUL)) 561 ) { 562 fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n"); 563 *pErrorCode=U_INVALID_TABLE_FORMAT; 564 } 565 } 566 567 /* return true if a base table was read, false for an extension table */ 568 static UBool 569 readFile(ConvData *data, const char* converterName, 570 UErrorCode *pErrorCode) { 571 char line[1024]; 572 char *end; 573 FileStream *convFile; 574 575 UCMStates *baseStates; 576 UBool dataIsBase; 577 578 if(U_FAILURE(*pErrorCode)) { 579 return false; 580 } 581 582 data->ucm=ucm_open(); 583 584 convFile=T_FileStream_open(converterName, "r"); 585 if(convFile==nullptr) { 586 *pErrorCode=U_FILE_ACCESS_ERROR; 587 return false; 588 } 589 590 readHeader(data, convFile, pErrorCode); 591 if(U_FAILURE(*pErrorCode)) { 592 return false; 593 } 594 595 if(data->ucm->baseName[0]==0) { 596 dataIsBase=true; 597 baseStates=&data->ucm->states; 598 ucm_processStates(baseStates, IGNORE_SISO_CHECK); 599 } else { 600 dataIsBase=false; 601 baseStates=nullptr; 602 } 603 604 /* read the base table */ 605 ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode); 606 if(U_FAILURE(*pErrorCode)) { 607 return false; 608 } 609 610 /* read an extension table if there is one */ 611 while(T_FileStream_readLine(convFile, line, sizeof(line))) { 612 end=uprv_strchr(line, 0); 613 while(line<end && 614 (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) { 615 --end; 616 } 617 *end=0; 618 619 if(line[0]=='#' || u_skipWhitespace(line)==end) { 620 continue; /* ignore empty and comment lines */ 621 } 622 623 if(0==uprv_strcmp(line, "CHARMAP")) { 624 /* read the extension table */ 625 ucm_readTable(data->ucm, convFile, false, baseStates, pErrorCode); 626 } else { 627 fprintf(stderr, "unexpected text after the base mapping table\n"); 628 } 629 break; 630 } 631 632 T_FileStream_close(convFile); 633 634 if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) { 635 fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n"); 636 *pErrorCode=U_INVALID_TABLE_FORMAT; 637 } 638 639 return dataIsBase; 640 } 641 642 static void 643 createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) { 644 ConvData baseData; 645 UBool dataIsBase; 646 647 UConverterStaticData *staticData; 648 UCMStates *states, *baseStates; 649 650 if(U_FAILURE(*pErrorCode)) { 651 return; 652 } 653 654 initConvData(data); 655 656 dataIsBase=readFile(data, converterName, pErrorCode); 657 if(U_FAILURE(*pErrorCode)) { 658 return; 659 } 660 661 staticData=&data->staticData; 662 states=&data->ucm->states; 663 664 if(dataIsBase) { 665 /* 666 * Build a normal .cnv file with a base table 667 * and an optional extension table. 668 */ 669 data->cnvData=MBCSOpen(data->ucm); 670 if(data->cnvData==nullptr) { 671 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 672 673 } else if(!data->cnvData->isValid(data->cnvData, 674 staticData->subChar, staticData->subCharLen) 675 ) { 676 fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); 677 *pErrorCode=U_INVALID_TABLE_FORMAT; 678 679 } else if(staticData->subChar1!=0 && 680 !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1) 681 ) { 682 fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); 683 *pErrorCode=U_INVALID_TABLE_FORMAT; 684 685 } else if( 686 data->ucm->ext->mappingsLength>0 && 687 !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, 0) 688 ) { 689 *pErrorCode=U_INVALID_TABLE_FORMAT; 690 } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) { 691 /* sort the table so that it can be turned into UTF-8-friendly data */ 692 ucm_sortTable(data->ucm->base); 693 } 694 695 if(U_SUCCESS(*pErrorCode)) { 696 if( 697 /* add the base table after ucm_checkBaseExt()! */ 698 !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData) 699 ) { 700 *pErrorCode=U_INVALID_TABLE_FORMAT; 701 } else { 702 /* 703 * addTable() may have requested moving more mappings to the extension table 704 * if they fit into the base toUnicode table but not into the 705 * base fromUnicode table. 706 * (Especially for UTF-8-friendly fromUnicode tables.) 707 * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them 708 * to be excluded from the extension toUnicode data. 709 * See MBCSOkForBaseFromUnicode() for which mappings do not fit into 710 * the base fromUnicode table. 711 */ 712 ucm_moveMappings(data->ucm->base, data->ucm->ext); 713 ucm_sortTable(data->ucm->ext); 714 if(data->ucm->ext->mappingsLength>0) { 715 /* prepare the extension table, if there is one */ 716 data->extData=CnvExtOpen(data->ucm); 717 if(data->extData==nullptr) { 718 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 719 } else if( 720 !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData) 721 ) { 722 *pErrorCode=U_INVALID_TABLE_FORMAT; 723 } 724 } 725 } 726 } 727 } else { 728 /* Build an extension-only .cnv file. */ 729 char baseFilename[500]; 730 char *basename; 731 732 initConvData(&baseData); 733 734 /* assemble a path/filename for data->ucm->baseName */ 735 uprv_strcpy(baseFilename, converterName); 736 basename = const_cast<char*>(findBasename(baseFilename)); 737 uprv_strcpy(basename, data->ucm->baseName); 738 uprv_strcat(basename, ".ucm"); 739 740 /* read the base table */ 741 dataIsBase=readFile(&baseData, baseFilename, pErrorCode); 742 if(U_FAILURE(*pErrorCode)) { 743 return; 744 } else if(!dataIsBase) { 745 fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename); 746 *pErrorCode=U_INVALID_TABLE_FORMAT; 747 } else { 748 /* prepare the extension table */ 749 data->extData=CnvExtOpen(data->ucm); 750 if(data->extData==nullptr) { 751 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 752 } else { 753 /* fill in gaps in extension file header fields */ 754 UCMapping *m, *mLimit; 755 uint8_t fallbackFlags; 756 757 baseStates=&baseData.ucm->states; 758 if(states->conversionType==UCNV_DBCS) { 759 staticData->minBytesPerChar = static_cast<int8_t>(states->minCharLength = 2); 760 } else if(states->minCharLength==0) { 761 staticData->minBytesPerChar = static_cast<int8_t>(states->minCharLength = baseStates->minCharLength); 762 } 763 if(states->maxCharLength<states->minCharLength) { 764 staticData->maxBytesPerChar = static_cast<int8_t>(states->maxCharLength = baseStates->maxCharLength); 765 } 766 767 if(staticData->subCharLen==0) { 768 uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4); 769 staticData->subCharLen=baseData.staticData.subCharLen; 770 } 771 /* 772 * do not copy subChar1 - 773 * only use what is explicitly specified 774 * because it cannot be unset in the extension file header 775 */ 776 777 /* get the fallback flags */ 778 fallbackFlags=0; 779 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; 780 m<mLimit && fallbackFlags!=3; 781 ++m 782 ) { 783 if(m->f==1) { 784 fallbackFlags|=1; 785 } else if(m->f==3) { 786 fallbackFlags|=2; 787 } 788 } 789 790 if(fallbackFlags&1) { 791 staticData->hasFromUnicodeFallback=true; 792 } 793 if(fallbackFlags&2) { 794 staticData->hasToUnicodeFallback=true; 795 } 796 797 if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) { 798 fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); 799 *pErrorCode=U_INVALID_TABLE_FORMAT; 800 801 } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) { 802 fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); 803 *pErrorCode=U_INVALID_TABLE_FORMAT; 804 805 } else if( 806 !ucm_checkValidity(data->ucm->ext, baseStates) || 807 !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, 0) 808 ) { 809 *pErrorCode=U_INVALID_TABLE_FORMAT; 810 } else { 811 if(states->maxCharLength>1) { 812 /* 813 * When building a normal .cnv file with a base table 814 * for an MBCS (not SBCS) table with explicit precision flags, 815 * the MBCSAddTable() function marks some mappings for moving 816 * to the extension table. 817 * They fit into the base toUnicode table but not into the 818 * base fromUnicode table. 819 * (Note: We do have explicit precision flags because they are 820 * required for extension table generation, and 821 * ucm_checkBaseExt() verified it.) 822 * 823 * We do not call MBCSAddTable() here (we probably could) 824 * so we need to do the analysis before building the extension table. 825 * We assume that MBCSAddTable() will build a UTF-8-friendly table. 826 * Redundant mappings in the extension table are ok except they cost some size. 827 * 828 * Do this after ucm_checkBaseExt(). 829 */ 830 const MBCSData *mbcsData=MBCSGetDummy(); 831 int32_t needsMove=0; 832 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; 833 m<mLimit; 834 ++m 835 ) { 836 if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) { 837 m->f|=MBCS_FROM_U_EXT_FLAG; 838 m->moveFlag=UCM_MOVE_TO_EXT; 839 ++needsMove; 840 } 841 } 842 843 if(needsMove!=0) { 844 ucm_moveMappings(baseData.ucm->base, data->ucm->ext); 845 ucm_sortTable(data->ucm->ext); 846 } 847 } 848 if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) { 849 *pErrorCode=U_INVALID_TABLE_FORMAT; 850 } 851 } 852 } 853 } 854 855 cleanupConvData(&baseData); 856 } 857 } 858 859 /* 860 * Hey, Emacs, please set the following: 861 * 862 * Local Variables: 863 * indent-tabs-mode: nil 864 * End: 865 * 866 */