tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gencfu.cpp (10659B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2009-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *
      9 * File gencfu.cpp
     10 */
     11 
     12 //--------------------------------------------------------------------
     13 //
     14 //   Tool for generating Unicode Confusable data files (.cfu files).
     15 //   .cfu files contain the compiled form of the confusable data
     16 //   derived from the Unicode Consortium data described in
     17 //   Unicode UAX 39.
     18 //
     19 //   Usage:  gencfu [options] -r confusables-file.txt -o output-file.cfu
     20 //
     21 //       options:   -v         verbose
     22 //                  -? or -h   help
     23 //
     24 //   The input rule files are plain text files containing confusable character
     25 //    definitions in the input format defined by Unicode UAX39 for the files
     26 //    confusables.txt.  This source (.txt) format
     27 //    is also accepted by ICU spoof detectors. The
     28 //    files must be encoded in utf-8 format, with or without a BOM.
     29 //
     30 //   This tool also used to compile confusablesWholeScript.txt into the CFU file,
     31 //    until the Unicode Consortium deprecated that data file.
     32 //
     33 //--------------------------------------------------------------------
     34 
     35 #include "unicode/utypes.h"
     36 #include "unicode/unistr.h"
     37 #include "unicode/uclean.h"
     38 #include "unicode/udata.h"
     39 #include "unicode/putil.h"
     40 
     41 #include "uoptions.h"
     42 #include "unewdata.h"
     43 #include "ucmndata.h"
     44 #include "uspoof_impl.h"
     45 #include "cmemory.h"
     46 
     47 #include <stdio.h>
     48 #include <stdlib.h>
     49 #include <string.h>
     50 
     51 U_NAMESPACE_USE
     52 
     53 static char *progName;
     54 static UOption options[]={
     55    UOPTION_HELP_H,             /* 0 */
     56    UOPTION_HELP_QUESTION_MARK, /* 1 */
     57    UOPTION_VERBOSE,            /* 2 */
     58    { "rules", nullptr, nullptr, nullptr, 'r', UOPT_REQUIRES_ARG, 0 },   /* 3 */
     59    { "wsrules", nullptr, nullptr, nullptr, 'w', UOPT_REQUIRES_ARG, 0},  /* 4 */  // deprecated
     60    { "out",   nullptr, nullptr, nullptr, 'o', UOPT_REQUIRES_ARG, 0 },   /* 5 */
     61    UOPTION_ICUDATADIR,         /* 6 */
     62    UOPTION_DESTDIR,            /* 7 */
     63    UOPTION_COPYRIGHT,          /* 8 */
     64    UOPTION_QUIET,              /* 9 */
     65 };
     66 
     67 void usageAndDie(int retCode) {
     68        printf("Usage: %s [-v] [-options] -r confusablesRules.txt -o output-file\n", progName);
     69        printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
     70            "options:\n"
     71            "\t-h or -? or --help  this usage text\n"
     72            "\t-V or --version     show a version message\n"
     73            "\t-c or --copyright   include a copyright notice\n"
     74            "\t-v or --verbose     turn on verbose output\n"
     75            "\t-q or --quiet       do not display warnings and progress\n"
     76            "\t-i or --icudatadir  directory for locating any needed intermediate data files,\n"
     77            "\t                    followed by path, defaults to %s\n"
     78            "\t-d or --destdir     destination directory, followed by the path\n",
     79            u_getDataDirectory());
     80        exit (retCode);
     81 }
     82 
     83 
     84 #if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
     85 
     86 /* dummy UDataInfo cf. udata.h */
     87 static UDataInfo dummyDataInfo = {
     88    sizeof(UDataInfo),
     89    0,
     90 
     91    U_IS_BIG_ENDIAN,
     92    U_CHARSET_FAMILY,
     93    U_SIZEOF_UCHAR,
     94    0,
     95 
     96    { 0, 0, 0, 0 },                 /* dummy dataFormat */
     97    { 0, 0, 0, 0 },                 /* dummy formatVersion */
     98    { 0, 0, 0, 0 }                  /* dummy dataVersion */
     99 };
    100 
    101 #else
    102 
    103 //
    104 //  Set up the ICU data header, defined in ucmndata.h
    105 //
    106 DataHeader dh ={
    107    {sizeof(DataHeader),           // Struct MappedData
    108        0xda,
    109        0x27},
    110 
    111    {                               // struct UDataInfo
    112        sizeof(UDataInfo),          //     size
    113        0,                          //     reserved
    114        U_IS_BIG_ENDIAN,
    115        U_CHARSET_FAMILY,
    116        U_SIZEOF_UCHAR,
    117        0,                          //     reserved
    118 
    119    { 0x43, 0x66, 0x75, 0x20 },     //     dataFormat="Cfu "
    120    { 0xff, 0, 0, 0 },              //     formatVersion.  Filled in later with values
    121                                    //      from the  builder.  The  values declared
    122                                    //      here should never appear in any real data.
    123        { 5, 1, 0, 0 }              //   dataVersion (Unicode version)
    124    }};
    125 
    126 #endif
    127 
    128 // Forward declaration for function for reading source files.
    129 static const char *readFile(const char *fileName, int32_t *len);
    130 
    131 //----------------------------------------------------------------------------
    132 //
    133 //  main      for gencfu
    134 //
    135 //----------------------------------------------------------------------------
    136 int  main(int argc, char **argv) {
    137    UErrorCode  status = U_ZERO_ERROR;
    138    const char *confFileName;
    139    const char *outFileName;
    140    const char *outDir = nullptr;
    141    const char *copyright = nullptr;
    142 
    143    //
    144    // Pick up and check the command line arguments,
    145    //    using the standard ICU tool utils option handling.
    146    //
    147    U_MAIN_INIT_ARGS(argc, argv);
    148    progName = argv[0];
    149    argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
    150    if(argc<0) {
    151        // Unrecognized option
    152        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
    153        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    154    }
    155 
    156    if(options[0].doesOccur || options[1].doesOccur) {
    157        //  -? or -h for help.
    158        usageAndDie(0);
    159    }
    160 
    161    if (!(options[3].doesOccur && options[5].doesOccur)) {
    162        fprintf(stderr, "confusables file and output file must all be specified.\n");
    163        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    164    }
    165    confFileName   = options[3].value;
    166    outFileName    = options[5].value;
    167 
    168    if (options[6].doesOccur) {
    169        u_setDataDirectory(options[6].value);
    170    }
    171 
    172    status = U_ZERO_ERROR;
    173 
    174    /* Combine the directory with the file name */
    175    if(options[7].doesOccur) {
    176        outDir = options[7].value;
    177    }
    178    if (options[8].doesOccur) {
    179        copyright = U_COPYRIGHT_STRING;
    180    }
    181 
    182    UBool quiet = false;
    183    if (options[9].doesOccur) {
    184      quiet = true;
    185    }
    186 
    187 #if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO
    188    // spoof detection data file parsing is dependent on regular expressions.
    189    // TODO: have the tool return an error status.  Requires fixing the ICU data build
    190    //       so that it doesn't abort entirely on that error.
    191 
    192    UNewDataMemory *pData;
    193    char msg[1024];
    194 
    195    /* write message with just the name */
    196    snprintf(msg, sizeof(msg), "gencfu writes dummy %s because of UCONFIG_NO_REGULAR_EXPRESSIONS and/or UCONFIG_NO_NORMALIZATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    197    fprintf(stderr, "%s\n", msg);
    198 
    199    /* write the dummy data file */
    200    pData = udata_create(outDir, nullptr, outFileName, &dummyDataInfo, nullptr, &status);
    201    udata_writeBlock(pData, msg, strlen(msg));
    202    udata_finish(pData, &status);
    203    return (int)status;
    204 
    205 #else
    206 
    207    //  Read in the confusables source file
    208 
    209    int32_t      confusablesLen = 0;
    210    const char  *confusables = readFile(confFileName, &confusablesLen);
    211    if (confusables == nullptr) {
    212        printf("gencfu: error reading file  \"%s\"\n", confFileName);
    213        exit(-1);
    214    }
    215 
    216    //
    217    //  Create the Spoof Detector from the source confusables files.
    218    //     This will compile the data.
    219    //
    220    UParseError parseError;
    221    parseError.line = 0;
    222    parseError.offset = 0;
    223    int32_t errType;
    224    USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
    225                                              nullptr, 0,
    226                                              &errType, &parseError, &status);
    227    if (U_FAILURE(status)) {
    228        fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\"  at file %s, line %d, column %d\n",
    229                u_errorName(status), confFileName, static_cast<int>(parseError.line), static_cast<int>(parseError.offset));
    230        exit(status);
    231    }
    232 
    233 
    234    //
    235    //  Get the compiled rule data from the USpoofChecker.
    236    //
    237    uint32_t        outDataSize;
    238    uint8_t        *outData;
    239    outDataSize = uspoof_serialize(sc, nullptr, 0, &status);
    240    if (status != U_BUFFER_OVERFLOW_ERROR) {
    241        fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status));
    242        exit(status);
    243    }
    244    status = U_ZERO_ERROR;
    245    outData = new uint8_t[outDataSize];
    246    uspoof_serialize(sc, outData, outDataSize, &status);
    247 
    248    // Copy the data format version numbers from the spoof data header into the UDataMemory header.
    249    
    250    uprv_memcpy(dh.info.formatVersion, 
    251                reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion,
    252                sizeof(dh.info.formatVersion));
    253 
    254    //
    255    //  Create the output file
    256    //
    257    size_t bytesWritten;
    258    UNewDataMemory *pData;
    259    pData = udata_create(outDir, nullptr, outFileName, &(dh.info), copyright, &status);
    260    if(U_FAILURE(status)) {
    261        fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n", 
    262                         outFileName, u_errorName(status));
    263        exit(status);
    264    }
    265 
    266 
    267    //  Write the data itself.
    268    udata_writeBlock(pData, outData, outDataSize);
    269    // finish up 
    270    bytesWritten = udata_finish(pData, &status);
    271    if(U_FAILURE(status)) {
    272        fprintf(stderr, "gencfu: Error %d writing the output file\n", status);
    273        exit(status);
    274    }
    275    
    276    if (bytesWritten != outDataSize) {
    277        fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName);
    278        exit(-1);
    279    }
    280 
    281    uspoof_close(sc);
    282    delete [] outData;
    283    delete [] confusables;
    284    u_cleanup();
    285    if (!quiet) {
    286        printf("gencfu: tool completed successfully.\n");
    287    }
    288    return 0;
    289 #endif   // UCONFIG_NO_REGULAR_EXPRESSIONS
    290 }
    291 
    292 
    293 //
    294 //  Read in a confusables source file
    295 //
    296 static const char *readFile(const char *fileName, int32_t *len) {
    297    char       *result;
    298    long        fileSize;
    299    FILE        *file;
    300 
    301    file = fopen(fileName, "rb");
    302    if (file == nullptr) {
    303        return nullptr;
    304    }
    305    fseek(file, 0, SEEK_END);
    306    fileSize = ftell(file);
    307    fseek(file, 0, SEEK_SET);
    308    result = new char[fileSize+10];
    309    if (result==nullptr) {
    310        fclose(file);
    311        return nullptr;
    312    }
    313 
    314    long t = static_cast<long>(fread(result, 1, fileSize, file));
    315    if (t != fileSize)  {
    316        delete [] result;
    317        fclose(file);
    318        return nullptr;
    319    }
    320    result[fileSize]=0;
    321    *len = static_cast<int32_t>(fileSize);
    322    fclose(file);
    323    return result;
    324 }