tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationdatawriter.cpp (14153B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationdatawriter.cpp
      9 *
     10 * created on: 2013aug06
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/tblcoll.h"
     19 #include "unicode/udata.h"
     20 #include "unicode/uniset.h"
     21 #include "cmemory.h"
     22 #include "collationdata.h"
     23 #include "collationdatabuilder.h"
     24 #include "collationdatareader.h"
     25 #include "collationdatawriter.h"
     26 #include "collationfastlatin.h"
     27 #include "collationsettings.h"
     28 #include "collationtailoring.h"
     29 #include "uassert.h"
     30 #include "ucmndata.h"
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 uint8_t *
     35 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
     36    if(U_FAILURE(errorCode)) { return nullptr; }
     37    LocalMemory<uint8_t> buffer(static_cast<uint8_t*>(uprv_malloc(20000)));
     38    if(buffer.isNull()) {
     39        errorCode = U_MEMORY_ALLOCATION_ERROR;
     40        return nullptr;
     41    }
     42    UErrorCode bufferStatus = U_ZERO_ERROR;
     43    length = cloneBinary(buffer.getAlias(), 20000, bufferStatus);
     44    if(bufferStatus == U_BUFFER_OVERFLOW_ERROR) {
     45        if(buffer.allocateInsteadAndCopy(length, 0) == nullptr) {
     46            errorCode = U_MEMORY_ALLOCATION_ERROR;
     47            return nullptr;
     48        }
     49        bufferStatus = U_ZERO_ERROR;
     50        length = cloneBinary(buffer.getAlias(), length, bufferStatus);
     51    }
     52    if(U_FAILURE(bufferStatus)) {
     53        errorCode = bufferStatus;
     54        return nullptr;
     55    }
     56    return buffer.orphan();
     57 }
     58 
     59 int32_t
     60 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
     61    int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
     62    return CollationDataWriter::writeTailoring(
     63            *tailoring, *settings, indexes, dest, capacity,
     64            errorCode);
     65 }
     66 
// Template UDataInfo for serialized collation data.
// CollationDataWriter::write() copies it into the DataHeader of a tailoring
// and then overwrites dataVersion with the version passed in by the caller.
// Field names in the comments below follow the UDataInfo struct
// in unicode/udata.h.
static const UDataInfo dataInfo = {
    sizeof(UDataInfo),                  // size
    0,                                  // reservedWord

    U_IS_BIG_ENDIAN,                    // isBigEndian
    U_CHARSET_FAMILY,                   // charsetFamily
    U_SIZEOF_UCHAR,                     // sizeofUChar
    0,                                  // reservedByte

    { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
    { 5, 0, 0, 0 },                     // formatVersion
    { 6, 3, 0, 0 }                      // dataVersion
};
     80 
     81 int32_t
     82 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
     83                               const void *rootElements, int32_t rootElementsLength,
     84                               int32_t indexes[], uint8_t *dest, int32_t capacity,
     85                               UErrorCode &errorCode) {
     86    return write(true, nullptr,
     87                 data, settings,
     88                 rootElements, rootElementsLength,
     89                 indexes, dest, capacity, errorCode);
     90 }
     91 
     92 int32_t
     93 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
     94                                    int32_t indexes[], uint8_t *dest, int32_t capacity,
     95                                    UErrorCode &errorCode) {
     96    return write(false, t.version,
     97                 *t.data, settings,
     98                 nullptr, 0,
     99                 indexes, dest, capacity, errorCode);
    100 }
    101 
/**
 * Serializes collation data into dest, or computes the size that would be needed.
 *
 * Output layout: for a tailoring, a DataHeader (possibly padded by 4 bytes so that
 * the 64-bit CEs end up 8-aligned); then the indexes array; then the data items
 * at the byte offsets recorded in indexes[]. Offsets are relative to the start
 * of the indexes. For the base data no header is written here (headerSize is 0).
 *
 * @param isBase             true to write the base (root) data, false for a tailoring.
 * @param dataVersion        Copied into the header's dataVersion; only read when !isBase.
 * @param data               The collation data to serialize.
 * @param settings           Supplies the options, reorder codes and reorder table.
 * @param rootElements       Copied into the output as the root elements item.
 * @param rootElementsLength Number of 4-byte units in rootElements.
 * @param indexes            Caller-provided array. Entries up to and including
 *                           IX_TOTAL_SIZE are assigned here, so it needs room for them
 *                           (callers in this file allocate IX_TOTAL_SIZE + 1 entries).
 *                           Only the first indexesLength entries are copied to dest.
 * @param dest               Output buffer; may be nullptr only if capacity is 0.
 * @param capacity           Size of dest in bytes; must not be negative.
 * @param errorCode          In/out ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR for
 *                           bad dest/capacity, and to U_BUFFER_OVERFLOW_ERROR if the
 *                           output does not fit into capacity.
 * @return headerSize + totalSize, also when the buffer overflows (so that the caller
 *         can allocate a large enough buffer); 0 if errorCode already indicated a
 *         failure on entry or another error occurred.
 */
int32_t
CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
                           const CollationData &data, const CollationSettings &settings,
                           const void *rootElements, int32_t rootElementsLength,
                           int32_t indexes[], uint8_t *dest, int32_t capacity,
                           UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    if(capacity < 0 || (capacity > 0 && dest == nullptr)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Figure out which data items to write before settling on
    // the indexes length and writing offsets.
    // For any data item, we need to write the start and limit offsets,
    // so the indexes length must be at least index-of-start-offset + 2.
    int32_t indexesLength;
    UBool hasMappings;
    UnicodeSet unsafeBackwardSet;
    const CollationData *baseData = data.base;

    // The fast Latin version is stored in the bits above the low 16 bits
    // of indexes[IX_OPTIONS]; 0 if there is no fast Latin table.
    int32_t fastLatinVersion;
    if(data.fastLatinTable != nullptr) {
        fastLatinVersion = static_cast<int32_t>(CollationFastLatin::VERSION) << 16;
    } else {
        fastLatinVersion = 0;
    }
    int32_t fastLatinTableLength = 0;

    if(isBase) {
        // For the root collator, we write an even number of indexes
        // so that we start with an 8-aligned offset.
        indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
        U_ASSERT(settings.reorderCodesLength == 0);
        hasMappings = true;
        unsafeBackwardSet = *data.unsafeBackwardSet;
        fastLatinTableLength = data.fastLatinTableLength;
    } else if(baseData == nullptr) {
        // No base data: only settings are written, no mappings.
        hasMappings = false;
        if(settings.reorderCodesLength == 0) {
            // only options
            indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
        } else {
            // only options, reorder codes, and the reorder table
            indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
        }
    } else {
        hasMappings = true;
        // Tailored mappings, and what else?
        // Check in ascending order of optional tailoring data items.
        indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
        if(data.contextsLength != 0) {
            indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
        }
        // Write only the part of the unsafe-backward set that the base does not have.
        unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
        if(!unsafeBackwardSet.isEmpty()) {
            indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
        }
        // Write the fast Latin table only if it is not the same array as the base's.
        if(data.fastLatinTable != baseData->fastLatinTable) {
            fastLatinTableLength = data.fastLatinTableLength;
            indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
        }
    }

    UVector32 codesAndRanges(errorCode);
    const int32_t *reorderCodes = settings.reorderCodes;
    int32_t reorderCodesLength = settings.reorderCodesLength;
    if(settings.hasReordering() &&
            CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
        // Rebuild the full list of reorder ranges.
        // The list in the settings is truncated for efficiency.
        data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
        // Write the codes, then the ranges.
        for(int32_t i = 0; i < reorderCodesLength; ++i) {
            codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
        }
        if(U_FAILURE(errorCode)) { return 0; }
        reorderCodes = codesAndRanges.getBuffer();
        reorderCodesLength = codesAndRanges.size();
    }

    int32_t headerSize;
    if(isBase) {
        headerSize = 0;  // udata_create() writes the header
    } else {
        DataHeader header;
        header.dataHeader.magic1 = 0xda;
        header.dataHeader.magic2 = 0x27;
        // Start from the static dataInfo template, then set the caller's data version.
        uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
        uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
        headerSize = static_cast<int32_t>(sizeof(header));
        U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
        if(hasMappings && data.cesLength != 0) {
            // Sum of the sizes of the data items which are
            // not automatically multiples of 8 bytes and which are placed before the CEs.
            int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
            if((sum & 7) != 0) {
                // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
                // We add to the header size here.
                // Alternatively, we could increment the indexesLength
                // or add a few bytes to the reorderTable.
                headerSize += 4;
            }
        }
        header.dataHeader.headerSize = static_cast<uint16_t>(headerSize);
        if(headerSize <= capacity) {
            uprv_memcpy(dest, &header, sizeof(header));
            // Write 00 bytes so that the padding is not mistaken for a copyright string.
            uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
            // From here on, dest and capacity refer to the space after the header.
            dest += headerSize;
            capacity -= headerSize;
        } else {
            // The header alone does not fit: from here on only sizes are computed,
            // and the totalSize > capacity check below reports the overflow.
            dest = nullptr;
            capacity = 0;
        }
    }

    indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
    U_ASSERT((settings.options & ~0xffff) == 0);
    indexes[CollationDataReader::IX_OPTIONS] =
            data.numericPrimary | fastLatinVersion | settings.options;
    indexes[CollationDataReader::IX_RESERVED2] = 0;
    indexes[CollationDataReader::IX_RESERVED3] = 0;

    // Byte offsets of data items all start from the start of the indexes.
    // We add the headerSize at the very end.
    int32_t totalSize = indexesLength * 4;

    // Index of the jamo CE32s within the ce32s array,
    // or -1 if there are no mappings or the jamo CE32s are the same as in the base.
    if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
        indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);
    } else {
        indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
    }

    indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
    totalSize += reorderCodesLength * 4;

    indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
    if(settings.reorderTable != nullptr) {
        totalSize += 256;
    }

    indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
    if(hasMappings) {
        UErrorCode errorCode2 = U_ZERO_ERROR;
        int32_t length;
        // Serialize the trie directly into dest if there is any room left;
        // otherwise only ask for its length.
        if(totalSize < capacity) {
            length = utrie2_serialize(data.trie, dest + totalSize,
                                      capacity - totalSize, &errorCode2);
        } else {
            length = utrie2_serialize(data.trie, nullptr, 0, &errorCode2);
        }
        // An overflow is not fatal here: we still need the total size,
        // and the overflow is reported by the totalSize > capacity check below.
        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
            errorCode = errorCode2;
            return 0;
        }
        // The trie size should be a multiple of 8 bytes due to the way
        // compactIndex2(UNewTrie2 *trie) currently works.
        U_ASSERT((length & 7) == 0);
        totalSize += length;
    }

    indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
    indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
    if(hasMappings && data.cesLength != 0) {
        // This is the alignment that the header padding above is for.
        U_ASSERT(((headerSize + totalSize) & 7) == 0);
        totalSize += data.cesLength * 8;
    }

    indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
    indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
    if(hasMappings) {
        totalSize += data.ce32sLength * 4;
    }

    indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
    totalSize += rootElementsLength * 4;

    indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
    if(hasMappings) {
        totalSize += data.contextsLength * 2;
    }

    indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
    if(hasMappings && !unsafeBackwardSet.isEmpty()) {
        UErrorCode errorCode2 = U_ZERO_ERROR;
        int32_t length;
        // Same scheme as for the trie: serialize in place if there is room,
        // otherwise only determine the length (in 16-bit units).
        if(totalSize < capacity) {
            uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
            length = unsafeBackwardSet.serialize(
                    p, (capacity - totalSize) / 2, errorCode2);
        } else {
            length = unsafeBackwardSet.serialize(nullptr, 0, errorCode2);
        }
        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
            errorCode = errorCode2;
            return 0;
        }
        totalSize += length * 2;
    }

    indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
    totalSize += fastLatinTableLength * 2;

    // Scripts data (base only): numScripts, then the scriptsIndex entries,
    // then the scriptStarts entries, all as 16-bit units.
    UnicodeString scripts;
    indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
    if(isBase) {
        scripts.append(static_cast<char16_t>(data.numScripts));
        scripts.append(reinterpret_cast<const char16_t *>(data.scriptsIndex), data.numScripts + 16);
        scripts.append(reinterpret_cast<const char16_t *>(data.scriptStarts), data.scriptStartsLength);
        totalSize += scripts.length() * 2;
    }

    indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
    if(isBase) {
        totalSize += 256;
    }

    indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
    indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;

    // Not enough room: report the overflow together with the full size needed.
    if(totalSize > capacity) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return headerSize + totalSize;
    }

    // Everything fits: write the indexes and then each remaining data item.
    // copyData() copies nothing for an item whose start and limit offsets are equal.
    uprv_memcpy(dest, indexes, indexesLength * 4);
    copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
    copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
    // The trie has already been serialized into the dest buffer.
    copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
    copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
    copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
    copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
    // The unsafeBackwardSet has already been serialized into the dest buffer.
    copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
    copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
    copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);

    return headerSize + totalSize;
}
    343 
    344 void
    345 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
    346                              const void *src, uint8_t *dest) {
    347    int32_t start = indexes[startIndex];
    348    int32_t limit = indexes[startIndex + 1];
    349    if(start < limit) {
    350        uprv_memcpy(dest + start, src, limit - start);
    351    }
    352 }
    353 
    354 U_NAMESPACE_END
    355 
    356 #endif  // !UCONFIG_NO_COLLATION