tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

uspoof_conf.cpp (17856B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2008-2015, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  uspoof_conf.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009Jan05  (refactoring earlier files)
     16 *   created by: Andy Heninger
     17 *
     18 *   Internal classes for compiling confusable data into its binary (runtime) form.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/uspoof.h"
     23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     24 #if !UCONFIG_NO_NORMALIZATION
     25 
     26 #include "unicode/unorm.h"
     27 #include "unicode/uregex.h"
     28 #include "unicode/ustring.h"
     29 #include "cmemory.h"
     30 #include "uspoof_impl.h"
     31 #include "uhash.h"
     32 #include "uvector.h"
     33 #include "uassert.h"
     34 #include "uarrsort.h"
     35 #include "uspoof_conf.h"
     36 
     37 U_NAMESPACE_USE
     38 
     39 
     40 //---------------------------------------------------------------------
     41 //
     42 //  buildConfusableData   Compile the source confusable data, as defined by
     43 //                        the Unicode data file confusables.txt, into the binary
     44 //                        structures used by the confusable detector.
     45 //
     46 //                        The binary structures are described in uspoof_impl.h
     47 //
     48 //     1.  Parse the data, making a hash table mapping from a UChar32 to a String.
     49 //
     50 //     2.  Sort all of the strings encountered by length, since they will need to
     51 //         be stored in that order in the final string table.
     52 //         TODO: Sorting these strings by length is no longer needed since the removal of
     53 //         the string lengths table.  This logic can be removed to save processing time
     54 //         when building confusables data.
     55 //
     56 //     3.  Build a list of keys (UChar32s) from the four mapping tables.  Sort the
     57 //         list because that will be the ordering of our runtime table.
     58 //
     59 //     4.  Generate the run time string table.  This is generated before the key & value
     60 //         tables because we need the string indexes when building those tables.
     61 //
     62 //     5.  Build the run-time key and value tables.  These are parallel tables, and are built
     63 //         at the same time
     64 //
     65 
     66 SPUString::SPUString(LocalPointer<UnicodeString> s) {
     67    fStr = std::move(s);
     68    fCharOrStrTableIndex = 0;
     69 }
     70 
     71 
     72 SPUString::~SPUString() {
     73 }
     74 
     75 
     76 SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(nullptr), fHash(nullptr) {
     77    LocalPointer<UVector> vec(new UVector(status), status);
     78    if (U_FAILURE(status)) {
     79        return;
     80    }
     81    vec->setDeleter(
     82        [](void *obj) {delete static_cast<SPUString*>(obj);});
     83    fVec = vec.orphan();
     84    fHash = uhash_open(uhash_hashUnicodeString,           // key hash function
     85                       uhash_compareUnicodeString,        // Key Comparator
     86                       nullptr,                              // Value Comparator
     87                       &status);
     88 }
     89 
     90 
     91 SPUStringPool::~SPUStringPool() {
     92    delete fVec;
     93    uhash_close(fHash);
     94 }
     95 
     96 
     97 int32_t SPUStringPool::size() {
     98    return fVec->size();
     99 }
    100 
    101 SPUString *SPUStringPool::getByIndex(int32_t index) {
    102    SPUString* retString = static_cast<SPUString*>(fVec->elementAt(index));
    103    return retString;
    104 }
    105 
    106 
    107 // Comparison function for ordering strings in the string pool.
    108 // Compare by length first, then, within a group of the same length,
    109 // by code point order.
    110 // Conforms to the type signature for a USortComparator in uvector.h
    111 
    112 static int32_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
    113 const SPUString *sL = const_cast<const SPUString *>(
    114        static_cast<SPUString *>(left.pointer));
    115 	const SPUString *sR = const_cast<const SPUString *>(
    116 	    static_cast<SPUString *>(right.pointer));
    117    int32_t lenL = sL->fStr->length();
    118    int32_t lenR = sR->fStr->length();
    119    if (lenL < lenR) {
    120        return -1;
    121    } else if (lenL > lenR) {
    122        return 1;
    123    } else {
    124        return sL->fStr->compare(*(sR->fStr));
    125    }
    126 }
    127 
    128 void SPUStringPool::sort(UErrorCode &status) {
    129    fVec->sort(SPUStringCompare, status);
    130 }
    131 
    132 
    133 SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
    134    LocalPointer<UnicodeString> lpSrc(src);
    135    if (U_FAILURE(status)) {
    136        return nullptr;
    137    }
    138    SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
    139    if (hashedString != nullptr) {
    140        return hashedString;
    141    }
    142    LocalPointer<SPUString> spuStr(new SPUString(std::move(lpSrc)), status);
    143    hashedString = spuStr.getAlias();
    144    fVec->adoptElement(spuStr.orphan(), status);
    145    if (U_FAILURE(status)) {
    146        return nullptr;
    147    }
    148    uhash_put(fHash, src, hashedString, &status);
    149    return hashedString;
    150 }
    151 
    152 
    153 
    154 ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
    155    fSpoofImpl(spImpl),
    156    fInput(nullptr),
    157    fTable(nullptr),
    158    fKeySet(nullptr),
    159    fKeyVec(nullptr),
    160    fValueVec(nullptr),
    161    fStringTable(nullptr),
    162    stringPool(nullptr),
    163    fParseLine(nullptr),
    164    fParseHexNum(nullptr),
    165    fLineNum(0)
    166 {
    167    if (U_FAILURE(status)) {
    168        return;
    169    }
    170 
    171    fTable = uhash_open(uhash_hashLong, uhash_compareLong, nullptr, &status);
    172 
    173    fKeySet = new UnicodeSet();
    174    if (fKeySet == nullptr) {
    175        status = U_MEMORY_ALLOCATION_ERROR;
    176        return;
    177    }
    178 
    179    fKeyVec = new UVector(status);
    180    if (fKeyVec == nullptr) {
    181        status = U_MEMORY_ALLOCATION_ERROR;
    182        return;
    183    }
    184 
    185    fValueVec = new UVector(status);
    186    if (fValueVec == nullptr) {
    187        status = U_MEMORY_ALLOCATION_ERROR;
    188        return;
    189    }
    190 
    191    stringPool = new SPUStringPool(status);
    192    if (stringPool == nullptr) {
    193        status = U_MEMORY_ALLOCATION_ERROR;
    194        return;
    195    }
    196 }
    197 
    198 
    199 ConfusabledataBuilder::~ConfusabledataBuilder() {
    200    uprv_free(fInput);
    201    uregex_close(fParseLine);
    202    uregex_close(fParseHexNum);
    203    uhash_close(fTable);
    204    delete fKeySet;
    205    delete fKeyVec;
    206    delete fStringTable;
    207    delete fValueVec;
    208    delete stringPool;
    209 }
    210 
    211 
    212 void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
    213    int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {
    214 
    215    if (U_FAILURE(status)) {
    216        return;
    217    }
    218    ConfusabledataBuilder builder(spImpl, status);
    219    builder.build(confusables, confusablesLen, status);
    220    if (U_FAILURE(status) && errorType != nullptr) {
    221        *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    222        pe->line = builder.fLineNum;
    223    }
    224 }
    225 
    226 
    227 void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
    228               UErrorCode &status) {
    229 
    230    // Convert the user input data from UTF-8 to char16_t (UTF-16)
    231    int32_t inputLen = 0;
    232    if (U_FAILURE(status)) {
    233        return;
    234    }
    235    UErrorCode bufferStatus = U_ZERO_ERROR;
    236    u_strFromUTF8(nullptr, 0, &inputLen, confusables, confusablesLen, &bufferStatus);
    237    if (bufferStatus != U_BUFFER_OVERFLOW_ERROR) {
    238        if (U_FAILURE(bufferStatus)) {
    239            status = bufferStatus;
    240        }
    241        return;
    242    }
    243    fInput = static_cast<char16_t *>(uprv_malloc((inputLen+1) * sizeof(char16_t)));
    244    if (fInput == nullptr) {
    245        status = U_MEMORY_ALLOCATION_ERROR;
    246        return;
    247    }
    248    u_strFromUTF8(fInput, inputLen+1, nullptr, confusables, confusablesLen, &status);
    249 
    250 
    251    // Regular Expression to parse a line from Confusables.txt.  The expression will match
    252    // any line.  What was matched is determined by examining which capture groups have a match.
    253    //   Capture Group 1:  the source char
    254    //   Capture Group 2:  the replacement chars
    255    //   Capture Group 3-6  the table type, SL, SA, ML, or MA (deprecated)
    256    //   Capture Group 7:  A blank or comment only line.
    257    //   Capture Group 8:  A syntactically invalid line.  Anything that didn't match before.
    258    // Example Line from the confusables.txt source file:
    259    //   "1D702 ;	006E 0329 ;	SL	# MATHEMATICAL ITALIC SMALL ETA ... "
    260    UnicodeString pattern(
    261        "(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;"      // Match the source char
    262        "[ \\t]*([0-9A-Fa-f]+"                    // Match the replacement char(s)
    263           "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;"    //     (continued)
    264        "\\s*(?:(SL)|(SA)|(ML)|(MA))"             // Match the table type
    265        "[ \\t]*(?:#.*?)?$"                       // Match any trailing #comment
    266        "|^([ \\t]*(?:#.*?)?)$"       // OR match empty lines or lines with only a #comment
    267        "|^(.*?)$", -1, US_INV);      // OR match any line, which catches illegal lines.
    268    // TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
    269    fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), 0, nullptr, &status);
    270 
    271    // Regular expression for parsing a hex number out of a space-separated list of them.
    272    //   Capture group 1 gets the number, with spaces removed.
    273    pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
    274    fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), 0, nullptr, &status);
    275 
    276    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    277    //   given the syntax of the input.
    278    if (*fInput == 0xfeff) {
    279        *fInput = 0x20;
    280    }
    281 
    282    // Parse the input, one line per iteration of this loop.
    283    uregex_setText(fParseLine, fInput, inputLen, &status);
    284    while (uregex_findNext(fParseLine, &status)) {
    285        fLineNum++;
    286        if (uregex_start(fParseLine, 7, &status) >= 0) {
    287            // this was a blank or comment line.
    288            continue;
    289        }
    290        if (uregex_start(fParseLine, 8, &status) >= 0) {
    291            // input file syntax error.
    292            status = U_PARSE_ERROR;
    293            return;
    294        }
    295 
    296        // We have a good input line.  Extract the key character and mapping string, and
    297        //    put them into the appropriate mapping table.
    298        UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
    299                          uregex_end(fParseLine, 1, &status), status);
    300 
    301        int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
    302        int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
    303        uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
    304 
    305        UnicodeString  *mapString = new UnicodeString();
    306        if (mapString == nullptr) {
    307            status = U_MEMORY_ALLOCATION_ERROR;
    308            return;
    309        }
    310        while (uregex_findNext(fParseHexNum, &status)) {
    311            UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
    312                                 uregex_end(fParseHexNum, 1, &status), status);
    313            mapString->append(c);
    314        }
    315        U_ASSERT(mapString->length() >= 1);
    316 
    317        // Put the map (value) string into the string pool
    318        // This a little like a Java intern() - any duplicates will be eliminated.
    319        SPUString *smapString = stringPool->addString(mapString, status);
    320 
    321        // Add the UChar32 -> string mapping to the table.
    322        // For Unicode 8, the SL, SA and ML tables have been discontinued.
    323        //                All input data from confusables.txt is tagged MA.
    324        uhash_iput(fTable, keyChar, smapString, &status);
    325        if (U_FAILURE(status)) { return; }
    326        fKeySet->add(keyChar);
    327    }
    328 
    329    // Input data is now all parsed and collected.
    330    // Now create the run-time binary form of the data.
    331    //
    332    // This is done in two steps.  First the data is assembled into vectors and strings,
    333    //   for ease of construction, then the contents of these collections are dumped
    334    //   into the actual raw-bytes data storage.
    335 
    336    // Build up the string array, and record the index of each string therein
    337    //  in the (build time only) string pool.
    338    // Strings of length one are not entered into the strings array.
    339    // (Strings in the table are sorted by length)
    340    stringPool->sort(status);
    341    fStringTable = new UnicodeString();
    342    int32_t poolSize = stringPool->size();
    343    int32_t i;
    344    for (i=0; i<poolSize; i++) {
    345        SPUString *s = stringPool->getByIndex(i);
    346        int32_t strLen = s->fStr->length();
    347        int32_t strIndex = fStringTable->length();
    348        if (strLen == 1) {
    349            // strings of length one do not get an entry in the string table.
    350            // Keep the single string character itself here, which is the same
    351            //  convention that is used in the final run-time string table index.
    352            s->fCharOrStrTableIndex = s->fStr->charAt(0);
    353        } else {
    354            s->fCharOrStrTableIndex = strIndex;
    355            fStringTable->append(*(s->fStr));
    356        }
    357    }
    358 
    359    // Construct the compile-time Key and Value tables
    360    //
    361    // For each key code point, check which mapping tables it applies to,
    362    //   and create the final data for the key & value structures.
    363    //
    364    //   The four logical mapping tables are conflated into one combined table.
    365    //   If multiple logical tables have the same mapping for some key, they
    366    //     share a single entry in the combined table.
    367    //   If more than one mapping exists for the same key code point, multiple
    368    //     entries will be created in the table
    369 
    370    for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
    371        // It is an oddity of the UnicodeSet API that simply enumerating the contained
    372        //   code points requires a nested loop.
    373        for (UChar32 keyChar=fKeySet->getRangeStart(range);
    374                keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
    375            SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(fTable, keyChar));
    376            U_ASSERT(targetMapping != nullptr);
    377 
    378            // Set an error code if trying to consume a long string.  Otherwise,
    379            // codePointAndLengthToKey will abort on a U_ASSERT.
    380            if (targetMapping->fStr->length() > 256) {
    381                status = U_ILLEGAL_ARGUMENT_ERROR;
    382                return;
    383            }
    384 
    385            int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
    386                targetMapping->fStr->length());
    387            int32_t value = targetMapping->fCharOrStrTableIndex;
    388 
    389            fKeyVec->addElement(key, status);
    390            fValueVec->addElement(value, status);
    391        }
    392    }
    393 
    394    // Put the assembled data into the flat runtime array
    395    outputData(status);
    396 
    397    // All of the intermediate allocated data belongs to the ConfusabledataBuilder
    398    //  object  (this), and is deleted in the destructor.
    399 }
    400 
    401 //
    402 // outputData     The confusable data has been compiled and stored in intermediate
    403 //                collections and strings.  Copy it from there to the final flat
    404 //                binary array.
    405 //
    406 //                Note that as each section is added to the output data, the
    407 //                expand (reserveSpace() function will likely relocate it in memory.
    408 //                Be careful with pointers.
    409 //
    410 void ConfusabledataBuilder::outputData(UErrorCode &status) {
    411 
    412    U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned);
    413 
    414    //  The Key Table
    415    //     While copying the keys to the runtime array,
    416    //       also sanity check that they are sorted.
    417 
    418    int32_t numKeys = fKeyVec->size();
    419    int32_t *keys =
    420        static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
    421    if (U_FAILURE(status)) {
    422        return;
    423    }
    424    int i;
    425    UChar32 previousCodePoint = 0;
    426    for (i=0; i<numKeys; i++) {
    427        int32_t key =  fKeyVec->elementAti(i);
    428        UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
    429        (void)previousCodePoint;    // Suppress unused variable warning.
    430        // strictly greater because there can be only one entry per code point
    431        U_ASSERT(codePoint > previousCodePoint);
    432        keys[i] = key;
    433        previousCodePoint = codePoint;
    434    }
    435    SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
    436    rawData->fCFUKeys = static_cast<int32_t>(reinterpret_cast<char*>(keys) - reinterpret_cast<char*>(rawData));
    437    rawData->fCFUKeysSize = numKeys;
    438    fSpoofImpl->fSpoofData->fCFUKeys = keys;
    439 
    440 
    441    // The Value Table, parallels the key table
    442    int32_t numValues = fValueVec->size();
    443    U_ASSERT(numKeys == numValues);
    444    uint16_t *values =
    445        static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
    446    if (U_FAILURE(status)) {
    447        return;
    448    }
    449    for (i=0; i<numValues; i++) {
    450        uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
    451        U_ASSERT(value < 0xffff);
    452        values[i] = static_cast<uint16_t>(value);
    453    }
    454    rawData = fSpoofImpl->fSpoofData->fRawData;
    455    rawData->fCFUStringIndex = static_cast<int32_t>(reinterpret_cast<char*>(values) - reinterpret_cast<char*>(rawData));
    456    rawData->fCFUStringIndexSize = numValues;
    457    fSpoofImpl->fSpoofData->fCFUValues = values;
    458 
    459    // The Strings Table.
    460 
    461    uint32_t stringsLength = fStringTable->length();
    462    // Reserve an extra space so the string will be nul-terminated.  This is
    463    // only a convenience, for when debugging; it is not needed otherwise.
    464    char16_t *strings =
    465        static_cast<char16_t *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(char16_t)+2, status));
    466    if (U_FAILURE(status)) {
    467        return;
    468    }
    469    fStringTable->extract(strings, stringsLength+1, status);
    470    rawData = fSpoofImpl->fSpoofData->fRawData;
    471    U_ASSERT(rawData->fCFUStringTable == 0);
    472    rawData->fCFUStringTable = static_cast<int32_t>(reinterpret_cast<char*>(strings) - reinterpret_cast<char*>(rawData));
    473    rawData->fCFUStringTableLen = stringsLength;
    474    fSpoofImpl->fSpoofData->fCFUStrings = strings;
    475 }
    476 
    477 #endif
    478 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS