tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

genbrk.cpp (11267B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2002-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *
      9 * File genbrk.cpp
     10 */
     11 
     12 //--------------------------------------------------------------------
     13 //
     14 //   Tool for generating RuleBasedBreakIterator data files (.brk files).
     15 //   .brk files contain the precompiled rules for standard types
     16 //   of iterators - word, line, sentence, etc.
     17 //
     18 //   Usage:  genbrk [options] -r rule-file.txt  -o output-file.brk
     19 //
     20 //       options:   -v         verbose
     21 //                  -? or -h   help
     22 //
     23 //   The input rule file is a plain text file containing break rules
     24 //    in the input format accepted by RuleBasedBreakIterators.  The
     25 //    file can be encoded as UTF-8 or UTF-16 (either endian).  Files
     26 //    encoded as UTF-16 must include a BOM.
     27 //
     28 //--------------------------------------------------------------------
     29 
     30 #include "unicode/utypes.h"
     31 #include "unicode/ucnv.h"
     32 #include "unicode/unistr.h"
     33 #include "unicode/rbbi.h"
     34 #include "unicode/uclean.h"
     35 #include "unicode/udata.h"
     36 #include "unicode/putil.h"
     37 
     38 #include "uoptions.h"
     39 #include "unewdata.h"
     40 #include "ucmndata.h"
     41 #include "rbbidata.h"
     42 #include "cmemory.h"
     43 
     44 #include <stdio.h>
     45 #include <stdlib.h>
     46 #include <string.h>
     47 
     48 U_NAMESPACE_USE
     49 
     50 static char *progName;
     51 static UOption options[]={
     52    UOPTION_HELP_H,             /* 0 */
     53    UOPTION_HELP_QUESTION_MARK, /* 1 */
     54    UOPTION_VERBOSE,            /* 2 */
     55    { "rules", nullptr, nullptr, nullptr, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */
     56    { "out",   nullptr, nullptr, nullptr, 'o', UOPT_REQUIRES_ARG, 0 },   /* 4 */
     57    UOPTION_ICUDATADIR,         /* 5 */
     58    UOPTION_DESTDIR,            /* 6 */
     59    UOPTION_COPYRIGHT,          /* 7 */
     60    UOPTION_QUIET,              /* 8 */
     61 };
     62 
     63 void usageAndDie(int retCode) {
     64        printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName);
     65        printf("\tRead in break iteration rules text and write out the binary data.\n"
     66            "\tIf the rule file does not have a Unicode signature byte sequence, it is assumed\n"
     67            "\tto be UTF-8.\n"
     68            "options:\n"
     69            "\t-h or -? or --help  this usage text\n"
     70            "\t-V or --version     show a version message\n"
     71            "\t-c or --copyright   include a copyright notice\n"
     72            "\t-v or --verbose     turn on verbose output\n"
     73            "\t-q or --quiet       do not display warnings and progress\n"
     74            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
     75            "\t                    followed by path, defaults to %s\n"
     76            "\t-d or --destdir     destination directory, followed by the path\n",
     77            u_getDataDirectory());
     78        exit (retCode);
     79 }
     80 
     81 
     82 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
     83 
     84 /* dummy UDataInfo cf. udata.h */
     85 static UDataInfo dummyDataInfo = {
     86    sizeof(UDataInfo),
     87    0,
     88 
     89    U_IS_BIG_ENDIAN,
     90    U_CHARSET_FAMILY,
     91    U_SIZEOF_UCHAR,
     92    0,
     93 
     94    { 0, 0, 0, 0 },                 /* dummy dataFormat */
     95    { 0, 0, 0, 0 },                 /* dummy formatVersion */
     96    { 0, 0, 0, 0 }                  /* dummy dataVersion */
     97 };
     98 
     99 #else
    100 
    101 //
    102 //  Set up the ICU data header, defined in ucmndata.h
    103 //
    104 DataHeader dh ={
    105    {sizeof(DataHeader),           // Struct MappedData
    106        0xda,
    107        0x27},
    108 
    109    {                               // struct UDataInfo
    110        sizeof(UDataInfo),          //     size
    111        0,                          //     reserved
    112        U_IS_BIG_ENDIAN,
    113        U_CHARSET_FAMILY,
    114        U_SIZEOF_UCHAR,
    115        0,                          //     reserved
    116 
    117    { 0x42, 0x72, 0x6b, 0x20 },     //     dataFormat="Brk "
    118    { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
    119                                    //      from the RBBI rule builder.  The  values declared
    120                                    //      here should never appear in any real RBBI data.
    121        { 4, 1, 0, 0 }              //   dataVersion (Unicode version)
    122    }};
    123 
    124 #endif
    125 
    126 //----------------------------------------------------------------------------
    127 //
    128 //  main      for genbrk
    129 //
    130 //----------------------------------------------------------------------------
    131 int  main(int argc, char **argv) {
    132    UErrorCode  status = U_ZERO_ERROR;
    133    const char *ruleFileName;
    134    const char *outFileName;
    135    const char *outDir = nullptr;
    136    const char *copyright = nullptr;
    137 
    138    //
    139    // Pick up and check the command line arguments,
    140    //    using the standard ICU tool utils option handling.
    141    //
    142    U_MAIN_INIT_ARGS(argc, argv);
    143    progName = argv[0];
    144    argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
    145    if(argc<0) {
    146        // Unrecognized option
    147        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
    148        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    149    }
    150 
    151    if(options[0].doesOccur || options[1].doesOccur) {
    152        //  -? or -h for help.
    153        usageAndDie(0);
    154    }
    155 
    156    if (!(options[3].doesOccur && options[4].doesOccur)) {
    157        fprintf(stderr, "rule file and output file must both be specified.\n");
    158        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    159    }
    160    ruleFileName = options[3].value;
    161    outFileName  = options[4].value;
    162 
    163    if (options[5].doesOccur) {
    164        u_setDataDirectory(options[5].value);
    165    }
    166 
    167    status = U_ZERO_ERROR;
    168 
    169    /* Combine the directory with the file name */
    170    if(options[6].doesOccur) {
    171        outDir = options[6].value;
    172    }
    173    if (options[7].doesOccur) {
    174        copyright = U_COPYRIGHT_STRING;
    175    }
    176 
    177 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO
    178 
    179    UNewDataMemory *pData;
    180    char msg[1024];
    181 
    182    /* write message with just the name */
    183    snprintf(msg, sizeof(msg), "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    184    fprintf(stderr, "%s\n", msg);
    185 
    186    /* write the dummy data file */
    187    pData = udata_create(outDir, nullptr, outFileName, &dummyDataInfo, nullptr, &status);
    188    udata_writeBlock(pData, msg, strlen(msg));
    189    udata_finish(pData, &status);
    190    return (int)status;
    191 
    192 #else
    193    /* Initialize ICU */
    194    u_init(&status);
    195    if (U_FAILURE(status)) {
    196        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
    197            argv[0], u_errorName(status));
    198        exit(1);
    199    }
    200    status = U_ZERO_ERROR;
    201 
    202    //
    203    //  Read in the rule source file
    204    //
    205    long        result;
    206    long        ruleFileSize;
    207    FILE        *file;
    208    char        *ruleBufferC;
    209 
    210    file = fopen(ruleFileName, "rb");
    211    if (file == nullptr) {
    212        fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
    213        exit(-1);
    214    }
    215    fseek(file, 0, SEEK_END);
    216    ruleFileSize = ftell(file);
    217    fseek(file, 0, SEEK_SET);
    218    ruleBufferC = new char[ruleFileSize+10];
    219 
    220    result = static_cast<long>(fread(ruleBufferC, 1, ruleFileSize, file));
    221    if (result != ruleFileSize)  {
    222        fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
    223        exit (-1);
    224    }
    225    ruleBufferC[ruleFileSize]=0;
    226    fclose(file);
    227 
    228    //
    229    // Look for a Unicode Signature (BOM) on the rule file
    230    //
    231    int32_t        signatureLength;
    232    const char *   ruleSourceC = ruleBufferC;
    233    const char*    encoding = ucnv_detectUnicodeSignature(
    234                           ruleSourceC, ruleFileSize, &signatureLength, &status);
    235    if (U_FAILURE(status)) {
    236        exit(status);
    237    }
    238    if (encoding == nullptr) {
    239        // In the absence of a BOM, assume the rule file is in UTF-8.
    240        encoding = "UTF-8";
    241    } else {
    242        ruleSourceC  += signatureLength;
    243        ruleFileSize -= signatureLength;
    244    }
    245 
    246    //
    247    // Open a converter to take the rule file to UTF-16
    248    //
    249    UConverter* conv;
    250    conv = ucnv_open(encoding, &status);
    251    if (U_FAILURE(status)) {
    252        fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
    253        exit(status);
    254    }
    255 
    256    //
    257    // Convert the rules to char16_t.
    258    //  Preflight first to determine required buffer size.
    259    //
    260    uint32_t destCap = ucnv_toUChars(conv,
    261                       nullptr,           //  dest,
    262                       0,              //  destCapacity,
    263                       ruleSourceC,
    264                       ruleFileSize,
    265                       &status);
    266    if (status != U_BUFFER_OVERFLOW_ERROR) {
    267        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    268        exit(status);
    269    }
    270 
    271    status = U_ZERO_ERROR;
    272    char16_t *ruleSourceU = new char16_t[destCap+1];
    273    ucnv_toUChars(conv,
    274                  ruleSourceU,     //  dest,
    275                  destCap+1,
    276                  ruleSourceC,
    277                  ruleFileSize,
    278                  &status);
    279    if (U_FAILURE(status)) {
    280        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    281        exit(status);
    282    }
    283    ucnv_close(conv);
    284 
    285 
    286    //
    287    //  Put the source rules into a UnicodeString
    288    //
    289    UnicodeString ruleSourceS(false, ruleSourceU, destCap);
    290 
    291    //
    292    //  Create the break iterator from the rules
    293    //     This will compile the rules.
    294    //
    295    UParseError parseError;
    296    parseError.line = 0;
    297    parseError.offset = 0;
    298    RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
    299    if (U_FAILURE(status)) {
    300        fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\"  at line %d, column %d\n",
    301                u_errorName(status), static_cast<int>(parseError.line), static_cast<int>(parseError.offset));
    302        exit(status);
    303    }
    304 
    305 
    306    //
    307    //  Get the compiled rule data from the break iterator.
    308    //
    309    uint32_t        outDataSize;
    310    const uint8_t  *outData;
    311    outData = bi->getBinaryRules(outDataSize);
    312 
    313    // Copy the data format version numbers from the RBBI data header into the UDataMemory header.
    314    uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion));
    315 
    316    //
    317    //  Create the output file
    318    //
    319    size_t bytesWritten;
    320    UNewDataMemory *pData;
    321    pData = udata_create(outDir, nullptr, outFileName, &(dh.info), copyright, &status);
    322    if(U_FAILURE(status)) {
    323        fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", 
    324                         outFileName, u_errorName(status));
    325        exit(status);
    326    }
    327 
    328 
    329    //  Write the data itself.
    330    udata_writeBlock(pData, outData, outDataSize);
    331    // finish up 
    332    bytesWritten = udata_finish(pData, &status);
    333    if(U_FAILURE(status)) {
    334        fprintf(stderr, "genbrk: error %d writing the output file\n", status);
    335        exit(status);
    336    }
    337    
    338    if (bytesWritten != outDataSize) {
    339        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
    340        exit(-1);
    341    }
    342 
    343    delete bi;
    344    delete[] ruleSourceU;
    345    delete[] ruleBufferC;
    346    u_cleanup();
    347 
    348 
    349    if(!options[8].doesOccur) {
    350        printf("genbrk: tool completed successfully.\n");
    351    }
    352    return 0;
    353 
    354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    355 }