[ tor-browser ].git.dasho

gennorm2.cpp (12174B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2009-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  gennorm2.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009nov25
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This program reads text files that define Unicode normalization,
     19 *   parses them, and builds a binary data file.
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 #include "n2builder.h"
     24 
     25 #include <fstream>
     26 #include <stdio.h>
     27 #include <stdlib.h>
     28 #include <string>
     29 #include <string.h>
     30 #include "unicode/errorcode.h"
     31 #include "unicode/localpointer.h"
     32 #include "unicode/putil.h"
     33 #include "unicode/uchar.h"
     34 #include "unicode/unistr.h"
     35 #include "charstr.h"
     36 #include "normalizer2impl.h"
     37 #include "toolutil.h"
     38 #include "uoptions.h"
     39 #include "uparse.h"
     40 
     41 #if UCONFIG_NO_NORMALIZATION
     42 #include "unewdata.h"
     43 #endif
     44 
     45 U_NAMESPACE_BEGIN
     46 
     47 UBool beVerbose=false, haveCopyright=true;
     48 
     49 #if !UCONFIG_NO_NORMALIZATION
     50 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder);
     51 #endif
     52 
     53 /* -------------------------------------------------------------------------- */
     54 
     55 enum {
     56    HELP_H,
     57    HELP_QUESTION_MARK,
     58    VERBOSE,
     59    COPYRIGHT,
     60    SOURCEDIR,
     61    OUTPUT_FILENAME,
     62    UNICODE_VERSION,
     63    WRITE_C_SOURCE,
     64    WRITE_COMBINED_DATA,
     65    OPT_FAST
     66 };
     67 
     68 static UOption options[]={
     69    UOPTION_HELP_H,
     70    UOPTION_HELP_QUESTION_MARK,
     71    UOPTION_VERBOSE,
     72    UOPTION_COPYRIGHT,
     73    UOPTION_SOURCEDIR,
     74    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
     75    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
     76    UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
     77    UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
     78    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
     79 };
     80 
     81 U_NAMESPACE_END
     82 
     83 int
     84 main(int argc, char* argv[]) {
     85    U_NAMESPACE_USE
     86    U_MAIN_INIT_ARGS(argc, argv);
     87 
     88    /* preset then read command line options */
     89    options[SOURCEDIR].value="";
     90    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
     91 
     92    /* error handling, printing usage message */
     93    if(argc<0) {
     94        fprintf(stderr,
     95            "error in command line argument \"%s\"\n",
     96            argv[-argc]);
     97    }
     98    if(!options[OUTPUT_FILENAME].doesOccur) {
     99        argc=-1;
    100    }
    101    if( argc<2 ||
    102        options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
    103    ) {
    104        fprintf(stderr,
    105            "Usage: %s [-options] infiles+ -o outputfilename\n"
    106            "\n"
    107            "Reads the infiles with normalization data and\n"
    108            "creates a binary file, or a C source file (--csource), with the data,\n"
    109            "or writes a data file with the combined data (--combined).\n"
    110            "See https://unicode-org.github.io/icu/userguide/transforms/normalization#data-file-syntax\n"
    111            "\n"
    112            "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
    113            "\n"
    114            "Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
    115            "in input-file syntax to the outputfilename.\n"
    116            "It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
    117            "(Useful for computing minimal incremental mapping data files.)\n"
    118            "\n",
    119            argv[0], argv[0]);
    120        fprintf(stderr,
    121            "Options:\n"
    122            "\t-h or -? or --help  this usage text\n"
    123            "\t-v or --verbose     verbose output\n"
    124            "\t-c or --copyright   include a copyright notice\n"
    125            "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
    126        fprintf(stderr,
    127            "\t-s or --sourcedir   source directory, followed by the path\n"
    128            "\t-o or --output      output filename\n"
    129            "\t      --csource     writes a C source file with initializers\n"
    130            "\t      --combined    writes a .txt file (input-file syntax) with the\n"
    131            "\t                    combined data from all of the input files\n");
    132        fprintf(stderr,
    133            "\t      --fast        optimize the data for fast normalization,\n"
    134            "\t                    which might increase its size  (Writes fully decomposed\n"
    135            "\t                    regular mappings instead of delta mappings.\n"
    136            "\t                    You should measure the runtime speed to make sure that\n"
    137            "\t                    this is a good trade-off.)\n");
    138        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    139    }
    140 
    141    beVerbose=options[VERBOSE].doesOccur;
    142    haveCopyright=options[COPYRIGHT].doesOccur;
    143 
    144    IcuToolErrorCode errorCode("gennorm2/main()");
    145 
    146 #if UCONFIG_NO_NORMALIZATION
    147 
    148    fprintf(stderr,
    149        "gennorm2 writes a dummy binary data file "
    150        "because UCONFIG_NO_NORMALIZATION is set, \n"
    151        "see icu/source/common/unicode/uconfig.h\n");
    152    udata_createDummy(nullptr, nullptr, options[OUTPUT_FILENAME].value, errorCode);
    153    // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
    154    // return U_UNSUPPORTED_ERROR;
    155    return 0;
    156 
    157 #else
    158 
    159    LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
    160    LocalPointer<Normalizer2DataBuilder> b2;
    161    LocalPointer<Normalizer2DataBuilder> diff;
    162    Normalizer2DataBuilder *builder = b1.getAlias();
    163    errorCode.assertSuccess();
    164 
    165    if(options[UNICODE_VERSION].doesOccur) {
    166        builder->setUnicodeVersion(options[UNICODE_VERSION].value);
    167    }
    168 
    169    if(options[OPT_FAST].doesOccur) {
    170        builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
    171    }
    172 
    173    // prepare the filename beginning with the source dir
    174    CharString filename(options[SOURCEDIR].value, errorCode);
    175    int32_t pathLength=filename.length();
    176    if( pathLength>0 &&
    177        filename[pathLength-1]!=U_FILE_SEP_CHAR &&
    178        filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
    179    ) {
    180        filename.append(U_FILE_SEP_CHAR, errorCode);
    181        pathLength=filename.length();
    182    }
    183 
    184    bool doMinus = false;
    185    for(int i=1; i<argc; ++i) {
    186        printf("gennorm2: processing %s\n", argv[i]);
    187        if(strcmp(argv[i], "minus") == 0) {
    188            if(doMinus) {
    189                fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
    190                exit(U_ILLEGAL_ARGUMENT_ERROR);
    191            }
    192            // Data from previous input files has been collected in b1.
    193            // Collect data from further input files in b2.
    194            b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
    195            diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
    196            errorCode.assertSuccess();
    197            builder = b2.getAlias();
    198            if(options[UNICODE_VERSION].doesOccur) {
    199                builder->setUnicodeVersion(options[UNICODE_VERSION].value);
    200            }
    201            if(options[OPT_FAST].doesOccur) {
    202                builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
    203            }
    204            doMinus = true;
    205            continue;
    206        }
    207        filename.append(argv[i], errorCode);
    208        std::ifstream f(filename.data());
    209        if(f.fail()) {
    210            fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
    211            exit(U_FILE_ACCESS_ERROR);
    212        }
    213        builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
    214        parseFile(f, *builder);
    215        filename.truncate(pathLength);
    216    }
    217 
    218    if(doMinus) {
    219        Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
    220        diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
    221    } else if(options[WRITE_COMBINED_DATA].doesOccur) {
    222        builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
    223    } else if(options[WRITE_C_SOURCE].doesOccur) {
    224        builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
    225    } else {
    226        builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
    227    }
    228 
    229    return errorCode.get();
    230 
    231 #endif
    232 }
    233 
    234 U_NAMESPACE_BEGIN
    235 
    236 #if !UCONFIG_NO_NORMALIZATION
    237 
    238 void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) {
    239    IcuToolErrorCode errorCode("gennorm2/parseFile()");
    240    std::string lineString;
    241    uint32_t startCP, endCP;
    242    while(std::getline(f, lineString)) {
    243        if (lineString.empty()) {
    244            continue;  // skip empty lines.
    245        }
    246        char *line = &lineString.front();
    247        char* comment = strchr(line, '#');
    248        if(comment!=nullptr) {
    249            *comment=0;
    250        }
    251        u_rtrim(line);
    252        if(line[0]==0) {
    253            continue;  // skip empty and comment-only lines
    254        }
    255        if(line[0]=='*') {
    256            const char *s=u_skipWhitespace(line+1);
    257            if(0==strncmp(s, "Unicode", 7)) {
    258                s=u_skipWhitespace(s+7);
    259                builder.setUnicodeVersion(s);
    260            }
    261            continue;  // reserved syntax
    262        }
    263        const char *delimiter;
    264        int32_t rangeLength=
    265            u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
    266        if(errorCode.isFailure()) {
    267            fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
    268            exit(errorCode.reset());
    269        }
    270        if (endCP >= 0xd800 && startCP <= 0xdfff) {
    271                fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n",
    272                        line);
    273                exit(U_ILLEGAL_ARGUMENT_ERROR);
    274        }
    275        delimiter=u_skipWhitespace(delimiter);
    276        if(*delimiter==':') {
    277            const char *s=u_skipWhitespace(delimiter+1);
    278            char *end;
    279            unsigned long value=strtoul(s, &end, 10);
    280            if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
    281                fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
    282                exit(U_PARSE_ERROR);
    283            }
    284            for (UChar32 c = static_cast<UChar32>(startCP); c <= static_cast<UChar32>(endCP); ++c) {
    285                builder.setCC(c, static_cast<uint8_t>(value));
    286            }
    287            continue;
    288        }
    289        if(*delimiter=='-') {
    290            if(*u_skipWhitespace(delimiter+1)!=0) {
    291                fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
    292                exit(U_PARSE_ERROR);
    293            }
    294            for (UChar32 c = static_cast<UChar32>(startCP); c <= static_cast<UChar32>(endCP); ++c) {
    295                builder.removeMapping(c);
    296            }
    297            continue;
    298        }
    299        if(*delimiter=='=' || *delimiter=='>') {
    300            char16_t uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
    301            int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), nullptr, errorCode);
    302            if(errorCode.isFailure()) {
    303                fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
    304                exit(errorCode.reset());
    305            }
    306            UnicodeString mapping(false, uchars, length);
    307            if(*delimiter=='=') {
    308                if(rangeLength!=1) {
    309                    fprintf(stderr,
    310                            "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
    311                            line);
    312                    exit(U_PARSE_ERROR);
    313                }
    314                builder.setRoundTripMapping(static_cast<UChar32>(startCP), mapping);
    315            } else {
    316                for (UChar32 c = static_cast<UChar32>(startCP); c <= static_cast<UChar32>(endCP); ++c) {
    317                    builder.setOneWayMapping(c, mapping);
    318                }
    319            }
    320            continue;
    321        }
    322        fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
    323        exit(U_PARSE_ERROR);
    324    }
    325 }
    326 
    327 #endif // !UCONFIG_NO_NORMALIZATION
    328 
    329 U_NAMESPACE_END
    330 
    331 /*
    332 * Hey, Emacs, please set the following:
    333 *
    334 * Local Variables:
    335 * indent-tabs-mode: nil
    336 * End:
    337 *
    338 */
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE