[ tor-browser ].git.dasho

collationdatareader.cpp (19538B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationdatareader.cpp
      9 *
     10 * created on: 2013feb07
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/ucol.h"
     19 #include "unicode/udata.h"
     20 #include "unicode/uscript.h"
     21 #include "cmemory.h"
     22 #include "collation.h"
     23 #include "collationdata.h"
     24 #include "collationdatareader.h"
     25 #include "collationfastlatin.h"
     26 #include "collationkeys.h"
     27 #include "collationrootelements.h"
     28 #include "collationsettings.h"
     29 #include "collationtailoring.h"
     30 #include "collunsafe.h"
     31 #include "normalizer2impl.h"
     32 #include "uassert.h"
     33 #include "ucmndata.h"
     34 #include "utrie2.h"
     35 
     36 U_NAMESPACE_BEGIN
     37 
     38 namespace {
     39 
     40 int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
     41    return (i < length) ? indexes[i] : -1;
     42 }
     43 
     44 }  // namespace
     45 
     46 void
     47 CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
     48                          CollationTailoring &tailoring, UErrorCode &errorCode) {
     49    if(U_FAILURE(errorCode)) { return; }
     50    if(base != nullptr) {
     51        if(inBytes == nullptr || (0 <= inLength && inLength < 24)) {
     52            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     53            return;
     54        }
     55        const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);
     56        if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
     57                isAcceptable(tailoring.version, nullptr, nullptr, &header->info))) {
     58            errorCode = U_INVALID_FORMAT_ERROR;
     59            return;
     60        }
     61        if(base->getUCAVersion() != tailoring.getUCAVersion()) {
     62            errorCode = U_COLLATOR_VERSION_MISMATCH;
     63            return;
     64        }
     65        int32_t headerLength = header->dataHeader.headerSize;
     66        inBytes += headerLength;
     67        if(inLength >= 0) {
     68            inLength -= headerLength;
     69        }
     70    }
     71 
     72    if(inBytes == nullptr || (0 <= inLength && inLength < 8)) {
     73        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     74        return;
     75    }
     76    const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
     77    int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
     78    if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {
     79        errorCode = U_INVALID_FORMAT_ERROR;  // Not enough indexes.
     80        return;
     81    }
     82 
     83    // Assume that the tailoring data is in initial state,
     84    // with nullptr pointers and 0 lengths.
     85 
     86    // Set pointers to non-empty data parts.
     87    // Do this in order of their byte offsets. (Should help porting to Java.)
     88 
     89    int32_t index;  // one of the indexes[] slots
     90    int32_t offset;  // byte offset for the index part
     91    int32_t length;  // number of bytes in the index part
     92 
     93    if(indexesLength > IX_TOTAL_SIZE) {
     94        length = inIndexes[IX_TOTAL_SIZE];
     95    } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
     96        length = inIndexes[indexesLength - 1];
     97    } else {
     98        length = 0;  // only indexes, and inLength was already checked for them
     99    }
    100    if(0 <= inLength && inLength < length) {
    101        errorCode = U_INVALID_FORMAT_ERROR;
    102        return;
    103    }
    104 
    105    const CollationData *baseData = base == nullptr ? nullptr : base->data;
    106    const int32_t *reorderCodes = nullptr;
    107    int32_t reorderCodesLength = 0;
    108    const uint32_t *reorderRanges = nullptr;
    109    int32_t reorderRangesLength = 0;
    110    index = IX_REORDER_CODES_OFFSET;
    111    offset = getIndex(inIndexes, indexesLength, index);
    112    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    113    if(length >= 4) {
    114        if(baseData == nullptr) {
    115            // We assume for collation settings that
    116            // the base data does not have a reordering.
    117            errorCode = U_INVALID_FORMAT_ERROR;
    118            return;
    119        }
    120        reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
    121        reorderCodesLength = length / 4;
    122 
    123        // The reorderRanges (if any) are the trailing reorderCodes entries.
    124        // Split the array at the boundary.
    125        // Script or reorder codes do not exceed 16-bit values.
    126        // Range limits are stored in the upper 16 bits, and are never 0.
    127        while(reorderRangesLength < reorderCodesLength &&
    128                (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
    129            ++reorderRangesLength;
    130        }
    131        U_ASSERT(reorderRangesLength < reorderCodesLength);
    132        if(reorderRangesLength != 0) {
    133            reorderCodesLength -= reorderRangesLength;
    134            reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);
    135        }
    136    }
    137 
    138    // There should be a reorder table only if there are reorder codes.
    139    // However, when there are reorder codes the reorder table may be omitted to reduce
    140    // the data size.
    141    const uint8_t *reorderTable = nullptr;
    142    index = IX_REORDER_TABLE_OFFSET;
    143    offset = getIndex(inIndexes, indexesLength, index);
    144    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    145    if(length >= 256) {
    146        if(reorderCodesLength == 0) {
    147            errorCode = U_INVALID_FORMAT_ERROR;  // Reordering table without reordering codes.
    148            return;
    149        }
    150        reorderTable = inBytes + offset;
    151    } else {
    152        // If we have reorder codes, then build the reorderTable at the end,
    153        // when the CollationData is otherwise complete.
    154    }
    155 
    156    if(baseData != nullptr && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
    157        errorCode = U_INVALID_FORMAT_ERROR;
    158        return;
    159    }
    160    CollationData *data = nullptr;  // Remains nullptr if there are no mappings.
    161 
    162    index = IX_TRIE_OFFSET;
    163    offset = getIndex(inIndexes, indexesLength, index);
    164    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    165    if(length >= 8) {
    166        if(!tailoring.ensureOwnedData(errorCode)) { return; }
    167        data = tailoring.ownedData;
    168        data->base = baseData;
    169        data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
    170        data->trie = tailoring.trie = utrie2_openFromSerialized(
    171            UTRIE2_32_VALUE_BITS, inBytes + offset, length, nullptr,
    172            &errorCode);
    173        if(U_FAILURE(errorCode)) { return; }
    174    } else if(baseData != nullptr) {
    175        // Use the base data. Only the settings are tailored.
    176        tailoring.data = baseData;
    177    } else {
    178        errorCode = U_INVALID_FORMAT_ERROR;  // No mappings.
    179        return;
    180    }
    181 
    182    index = IX_CES_OFFSET;
    183    offset = getIndex(inIndexes, indexesLength, index);
    184    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    185    if(length >= 8) {
    186        if(data == nullptr) {
    187            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ces without tailored trie.
    188            return;
    189        }
    190        data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
    191        data->cesLength = length / 8;
    192    }
    193 
    194    index = IX_CE32S_OFFSET;
    195    offset = getIndex(inIndexes, indexesLength, index);
    196    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    197    if(length >= 4) {
    198        if(data == nullptr) {
    199            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ce32s without tailored trie.
    200            return;
    201        }
    202        data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
    203        data->ce32sLength = length / 4;
    204    }
    205 
    206    int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
    207    if(jamoCE32sStart >= 0) {
    208        if(data == nullptr || data->ce32s == nullptr) {
    209            errorCode = U_INVALID_FORMAT_ERROR;  // Index into non-existent ce32s[].
    210            return;
    211        }
    212        data->jamoCE32s = data->ce32s + jamoCE32sStart;
    213    } else if(data == nullptr) {
    214        // Nothing to do.
    215    } else if(baseData != nullptr) {
    216        data->jamoCE32s = baseData->jamoCE32s;
    217    } else {
    218        errorCode = U_INVALID_FORMAT_ERROR;  // No Jamo CE32s for Hangul processing.
    219        return;
    220    }
    221 
    222    index = IX_ROOT_ELEMENTS_OFFSET;
    223    offset = getIndex(inIndexes, indexesLength, index);
    224    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    225    if(length >= 4) {
    226        length /= 4;
    227        if(data == nullptr || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
    228            errorCode = U_INVALID_FORMAT_ERROR;
    229            return;
    230        }
    231        data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
    232        data->rootElementsLength = length;
    233        uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
    234        if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
    235            errorCode = U_INVALID_FORMAT_ERROR;
    236            return;
    237        }
    238        uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
    239        if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
    240            // [fixed last secondary common byte] is too low,
    241            // and secondary weights would collide with compressed common secondaries.
    242            errorCode = U_INVALID_FORMAT_ERROR;
    243            return;
    244        }
    245    }
    246 
    247    index = IX_CONTEXTS_OFFSET;
    248    offset = getIndex(inIndexes, indexesLength, index);
    249    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    250    if(length >= 2) {
    251        if(data == nullptr) {
    252            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored contexts without tailored trie.
    253            return;
    254        }
    255        data->contexts = reinterpret_cast<const char16_t *>(inBytes + offset);
    256        data->contextsLength = length / 2;
    257    }
    258 
    259    index = IX_UNSAFE_BWD_OFFSET;
    260    offset = getIndex(inIndexes, indexesLength, index);
    261    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    262    if(length >= 2) {
    263        if(data == nullptr) {
    264            errorCode = U_INVALID_FORMAT_ERROR;
    265            return;
    266        }
    267        if(baseData == nullptr) {
    268 #if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE)
    269          tailoring.unsafeBackwardSet = new UnicodeSet(unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode);
    270          if(tailoring.unsafeBackwardSet == nullptr) {
    271            errorCode = U_MEMORY_ALLOCATION_ERROR;
    272            return;
    273          } else if (U_FAILURE(errorCode)) {
    274            return;
    275          }
    276 #else
    277            // Create the unsafe-backward set for the root collator.
    278            // Include all non-zero combining marks and trail surrogates.
    279            // We do this at load time, rather than at build time,
    280            // to simplify Unicode version bootstrapping:
    281            // The root data builder only needs the new FractionalUCA.txt data,
    282            // but it need not be built with a version of ICU already updated to
    283            // the corresponding new Unicode Character Database.
    284            //
    285            // The following is an optimized version of
    286            // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
    287            // It is faster and requires fewer code dependencies.
    288            tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
    289            if(tailoring.unsafeBackwardSet == nullptr) {
    290                errorCode = U_MEMORY_ALLOCATION_ERROR;
    291                return;
    292            }
    293            data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
    294 #endif // !COLLUNSAFE_SERIALIZE || !COLLUNSAFE_COLL_VERSION
    295        } else {
    296            // Clone the root collator's set contents.
    297            tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
    298                baseData->unsafeBackwardSet->cloneAsThawed());
    299            if(tailoring.unsafeBackwardSet == nullptr) {
    300                errorCode = U_MEMORY_ALLOCATION_ERROR;
    301                return;
    302            }
    303        }
    304        // Add the ranges from the data file to the unsafe-backward set.
    305        USerializedSet sset;
    306        const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);
    307        if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
    308            errorCode = U_INVALID_FORMAT_ERROR;
    309            return;
    310        }
    311        int32_t count = uset_getSerializedRangeCount(&sset);
    312        for(int32_t i = 0; i < count; ++i) {
    313            UChar32 start, end;
    314            uset_getSerializedRange(&sset, i, &start, &end);
    315            tailoring.unsafeBackwardSet->add(start, end);
    316        }
    317        // Mark each lead surrogate as "unsafe"
    318        // if any of its 1024 associated supplementary code points is "unsafe".
    319        UChar32 c = 0x10000;
    320        for(char16_t lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
    321            if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
    322                tailoring.unsafeBackwardSet->add(lead);
    323            }
    324        }
    325        tailoring.unsafeBackwardSet->freeze();
    326        data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
    327    } else if(data == nullptr) {
    328        // Nothing to do.
    329    } else if(baseData != nullptr) {
    330        // No tailoring-specific data: Alias the root collator's set.
    331        data->unsafeBackwardSet = baseData->unsafeBackwardSet;
    332    } else {
    333        errorCode = U_INVALID_FORMAT_ERROR;  // No unsafeBackwardSet.
    334        return;
    335    }
    336 
    337    // If the fast Latin format version is different,
    338    // or the version is set to 0 for "no fast Latin table",
    339    // then just always use the normal string comparison path.
    340    if(data != nullptr) {
    341        data->fastLatinTable = nullptr;
    342        data->fastLatinTableLength = 0;
    343        if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
    344            index = IX_FAST_LATIN_TABLE_OFFSET;
    345            offset = getIndex(inIndexes, indexesLength, index);
    346            length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    347            if(length >= 2) {
    348                data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
    349                data->fastLatinTableLength = length / 2;
    350                if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
    351                    errorCode = U_INVALID_FORMAT_ERROR;  // header vs. table version mismatch
    352                    return;
    353                }
    354            } else if(baseData != nullptr) {
    355                data->fastLatinTable = baseData->fastLatinTable;
    356                data->fastLatinTableLength = baseData->fastLatinTableLength;
    357            }
    358        }
    359    }
    360 
    361    index = IX_SCRIPTS_OFFSET;
    362    offset = getIndex(inIndexes, indexesLength, index);
    363    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    364    if(length >= 2) {
    365        if(data == nullptr) {
    366            errorCode = U_INVALID_FORMAT_ERROR;
    367            return;
    368        }
    369        const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
    370        int32_t scriptsLength = length / 2;
    371        data->numScripts = scripts[0];
    372        // There must be enough entries for both arrays, including more than two range starts.
    373        data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16);
    374        if(data->scriptStartsLength <= 2 ||
    375                CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {
    376            errorCode = U_INVALID_FORMAT_ERROR;
    377            return;
    378        }
    379        data->scriptsIndex = scripts + 1;
    380        data->scriptStarts = scripts + 1 + data->numScripts + 16;
    381        if(!(data->scriptStarts[0] == 0 &&
    382                data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) &&
    383                data->scriptStarts[data->scriptStartsLength - 1] ==
    384                        (Collation::TRAIL_WEIGHT_BYTE << 8))) {
    385            errorCode = U_INVALID_FORMAT_ERROR;
    386            return;
    387        }
    388    } else if(data == nullptr) {
    389        // Nothing to do.
    390    } else if(baseData != nullptr) {
    391        data->numScripts = baseData->numScripts;
    392        data->scriptsIndex = baseData->scriptsIndex;
    393        data->scriptStarts = baseData->scriptStarts;
    394        data->scriptStartsLength = baseData->scriptStartsLength;
    395    }
    396 
    397    index = IX_COMPRESSIBLE_BYTES_OFFSET;
    398    offset = getIndex(inIndexes, indexesLength, index);
    399    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    400    if(length >= 256) {
    401        if(data == nullptr) {
    402            errorCode = U_INVALID_FORMAT_ERROR;
    403            return;
    404        }
    405        data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
    406    } else if(data == nullptr) {
    407        // Nothing to do.
    408    } else if(baseData != nullptr) {
    409        data->compressibleBytes = baseData->compressibleBytes;
    410    } else {
    411        errorCode = U_INVALID_FORMAT_ERROR;  // No compressibleBytes[].
    412        return;
    413    }
    414 
    415    const CollationSettings &ts = *tailoring.settings;
    416    int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
    417    uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
    418    int32_t fastLatinOptions = CollationFastLatin::getOptions(
    419            tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries));
    420    if(options == ts.options && ts.variableTop != 0 &&
    421            reorderCodesLength == ts.reorderCodesLength &&
    422            (reorderCodesLength == 0 ||
    423                uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0) &&
    424            fastLatinOptions == ts.fastLatinOptions &&
    425            (fastLatinOptions < 0 ||
    426                uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
    427                            sizeof(fastLatinPrimaries)) == 0)) {
    428        return;
    429    }
    430 
    431    CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
    432    if(settings == nullptr) {
    433        errorCode = U_MEMORY_ALLOCATION_ERROR;
    434        return;
    435    }
    436    settings->options = options;
    437    // Set variableTop from options and scripts data.
    438    settings->variableTop = tailoring.data->getLastPrimaryForGroup(
    439            UCOL_REORDER_CODE_FIRST + int32_t{settings->getMaxVariable()});
    440    if(settings->variableTop == 0) {
    441        errorCode = U_INVALID_FORMAT_ERROR;
    442        return;
    443    }
    444 
    445    if(reorderCodesLength != 0) {
    446        settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,
    447                                  reorderRanges, reorderRangesLength,
    448                                  reorderTable, errorCode);
    449    }
    450 
    451    settings->fastLatinOptions = CollationFastLatin::getOptions(
    452        tailoring.data, *settings,
    453        settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));
    454 }
    455 
    456 UBool U_CALLCONV
    457 CollationDataReader::isAcceptable(void *context,
    458                                  const char * /* type */, const char * /*name*/,
    459                                  const UDataInfo *pInfo) {
    460    if(
    461        pInfo->size >= 20 &&
    462        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
    463        pInfo->charsetFamily == U_CHARSET_FAMILY &&
    464        pInfo->dataFormat[0] == 0x55 &&  // dataFormat="UCol"
    465        pInfo->dataFormat[1] == 0x43 &&
    466        pInfo->dataFormat[2] == 0x6f &&
    467        pInfo->dataFormat[3] == 0x6c &&
    468        pInfo->formatVersion[0] == 5
    469    ) {
    470        UVersionInfo *version = static_cast<UVersionInfo *>(context);
    471        if(version != nullptr) {
    472            uprv_memcpy(version, pInfo->dataVersion, 4);
    473        }
    474        return true;
    475    } else {
    476        return false;
    477    }
    478 }
    479 
    480 U_NAMESPACE_END
    481 
    482 #endif  // !UCONFIG_NO_COLLATION
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE