tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationdata.cpp (13501B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2012-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationdata.cpp
      9 *
     10 * created on: 2012jul28
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/ucol.h"
     19 #include "unicode/udata.h"
     20 #include "unicode/uscript.h"
     21 #include "cmemory.h"
     22 #include "collation.h"
     23 #include "collationdata.h"
     24 #include "uassert.h"
     25 #include "utrie2.h"
     26 #include "uvectr32.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 uint32_t
     31 CollationData::getIndirectCE32(uint32_t ce32) const {
     32    U_ASSERT(Collation::isSpecialCE32(ce32));
     33    int32_t tag = Collation::tagFromCE32(ce32);
     34    if(tag == Collation::DIGIT_TAG) {
     35        // Fetch the non-numeric-collation CE32.
     36        ce32 = ce32s[Collation::indexFromCE32(ce32)];
     37    } else if(tag == Collation::LEAD_SURROGATE_TAG) {
     38        ce32 = Collation::UNASSIGNED_CE32;
     39    } else if(tag == Collation::U0000_TAG) {
     40        // Fetch the normal ce32 for U+0000.
     41        ce32 = ce32s[0];
     42    }
     43    return ce32;
     44 }
     45 
     46 uint32_t
     47 CollationData::getFinalCE32(uint32_t ce32) const {
     48    if(Collation::isSpecialCE32(ce32)) {
     49        ce32 = getIndirectCE32(ce32);
     50    }
     51    return ce32;
     52 }
     53 
     54 int64_t
     55 CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
     56    if(U_FAILURE(errorCode)) { return 0; }
     57    // Keep parallel with CollationDataBuilder::getSingleCE().
     58    const CollationData *d;
     59    uint32_t ce32 = getCE32(c);
     60    if(ce32 == Collation::FALLBACK_CE32) {
     61        d = base;
     62        ce32 = base->getCE32(c);
     63    } else {
     64        d = this;
     65    }
     66    while(Collation::isSpecialCE32(ce32)) {
     67        switch(Collation::tagFromCE32(ce32)) {
     68        case Collation::LATIN_EXPANSION_TAG:
     69        case Collation::BUILDER_DATA_TAG:
     70        case Collation::PREFIX_TAG:
     71        case Collation::CONTRACTION_TAG:
     72        case Collation::HANGUL_TAG:
     73        case Collation::LEAD_SURROGATE_TAG:
     74            errorCode = U_UNSUPPORTED_ERROR;
     75            return 0;
     76        case Collation::FALLBACK_TAG:
     77        case Collation::RESERVED_TAG_3:
     78            errorCode = U_INTERNAL_PROGRAM_ERROR;
     79            return 0;
     80        case Collation::LONG_PRIMARY_TAG:
     81            return Collation::ceFromLongPrimaryCE32(ce32);
     82        case Collation::LONG_SECONDARY_TAG:
     83            return Collation::ceFromLongSecondaryCE32(ce32);
     84        case Collation::EXPANSION32_TAG:
     85            if(Collation::lengthFromCE32(ce32) == 1) {
     86                ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
     87                break;
     88            } else {
     89                errorCode = U_UNSUPPORTED_ERROR;
     90                return 0;
     91            }
     92        case Collation::EXPANSION_TAG: {
     93            if(Collation::lengthFromCE32(ce32) == 1) {
     94                return d->ces[Collation::indexFromCE32(ce32)];
     95            } else {
     96                errorCode = U_UNSUPPORTED_ERROR;
     97                return 0;
     98            }
     99        }
    100        case Collation::DIGIT_TAG:
    101            // Fetch the non-numeric-collation CE32 and continue.
    102            ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
    103            break;
    104        case Collation::U0000_TAG:
    105            U_ASSERT(c == 0);
    106            // Fetch the normal ce32 for U+0000 and continue.
    107            ce32 = d->ce32s[0];
    108            break;
    109        case Collation::OFFSET_TAG:
    110            return d->getCEFromOffsetCE32(c, ce32);
    111        case Collation::IMPLICIT_TAG:
    112            return Collation::unassignedCEFromCodePoint(c);
    113        }
    114    }
    115    return Collation::ceFromSimpleCE32(ce32);
    116 }
    117 
    118 uint32_t
    119 CollationData::getFirstPrimaryForGroup(int32_t script) const {
    120    int32_t index = getScriptIndex(script);
    121    return index == 0 ? 0 : static_cast<uint32_t>(scriptStarts[index]) << 16;
    122 }
    123 
    124 uint32_t
    125 CollationData::getLastPrimaryForGroup(int32_t script) const {
    126    int32_t index = getScriptIndex(script);
    127    if(index == 0) {
    128        return 0;
    129    }
    130    uint32_t limit = scriptStarts[index + 1];
    131    return (limit << 16) - 1;
    132 }
    133 
    134 int32_t
    135 CollationData::getGroupForPrimary(uint32_t p) const {
    136    p >>= 16;
    137    if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
    138        return -1;
    139    }
    140    int32_t index = 1;
    141    while(p >= scriptStarts[index + 1]) { ++index; }
    142    for(int32_t i = 0; i < numScripts; ++i) {
    143        if(scriptsIndex[i] == index) {
    144            return i;
    145        }
    146    }
    147    for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
    148        if(scriptsIndex[numScripts + i] == index) {
    149            return UCOL_REORDER_CODE_FIRST + i;
    150        }
    151    }
    152    return -1;
    153 }
    154 
    155 int32_t
    156 CollationData::getScriptIndex(int32_t script) const {
    157    if(script < 0) {
    158        return 0;
    159    } else if(script < numScripts) {
    160        return scriptsIndex[script];
    161    } else if(script < UCOL_REORDER_CODE_FIRST) {
    162        return 0;
    163    } else {
    164        script -= UCOL_REORDER_CODE_FIRST;
    165        if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
    166            return scriptsIndex[numScripts + script];
    167        } else {
    168            return 0;
    169        }
    170    }
    171 }
    172 
    173 int32_t
    174 CollationData::getEquivalentScripts(int32_t script,
    175                                    int32_t dest[], int32_t capacity,
    176                                    UErrorCode &errorCode) const {
    177    if(U_FAILURE(errorCode)) { return 0; }
    178    int32_t index = getScriptIndex(script);
    179    if(index == 0) { return 0; }
    180    if(script >= UCOL_REORDER_CODE_FIRST) {
    181        // Special groups have no aliases.
    182        if(capacity > 0) {
    183            dest[0] = script;
    184        } else {
    185            errorCode = U_BUFFER_OVERFLOW_ERROR;
    186        }
    187        return 1;
    188    }
    189 
    190    int32_t length = 0;
    191    for(int32_t i = 0; i < numScripts; ++i) {
    192        if(scriptsIndex[i] == index) {
    193            if(length < capacity) {
    194                dest[length] = i;
    195            }
    196            ++length;
    197        }
    198    }
    199    if(length > capacity) {
    200        errorCode = U_BUFFER_OVERFLOW_ERROR;
    201    }
    202    return length;
    203 }
    204 
    205 void
    206 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
    207                                 UVector32 &ranges, UErrorCode &errorCode) const {
    208    makeReorderRanges(reorder, length, false, ranges, errorCode);
    209 }
    210 
    211 void
    212 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
    213                                 UBool latinMustMove,
    214                                 UVector32 &ranges, UErrorCode &errorCode) const {
           // Computes the remapping of primary lead bytes for the given list of
           // reorder codes and stores it in `ranges` as (limit, offset) pairs
           // (encoding described at the end of this function).
           //
           // reorder/length: the script and special reorder codes, in the desired order.
           //     Codes for which getScriptIndex() returns 0 are skipped.
           // latinMustMove: when false, and Latin is the first code and the list has no
           //     special reorder codes, the gap between the low ranges and Latin is
           //     skipped so that Latin keeps its place. If the result then does not fit,
           //     the function calls itself once with latinMustMove=true.
           // ranges: emptied first; stays empty if length == 0 or the list consists
           //     of only USCRIPT_UNKNOWN.
           // errorCode: does nothing if it already indicates a failure. Set to
           //     U_ILLEGAL_ARGUMENT_ERROR for a duplicate or equivalent script, for a
           //     second USCRIPT_UNKNOWN, or for UCOL_REORDER_CODE_DEFAULT in the list;
           //     set to U_BUFFER_OVERFLOW_ERROR if the reordered ranges do not fit
           //     between the fixed low and high lead bytes.
    215    if(U_FAILURE(errorCode)) { return; }
    216    ranges.removeAllElements();
    217    if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
    218        return;
    219    }
    220 
    221    // Maps each script-or-group range to a new lead byte.
           // table[i] == 0 means range i has not been placed yet;
           // 0xff marks a reserved range whose lead byte does not matter.
    222    uint8_t table[MAX_NUM_SCRIPT_RANGES];
    223    uprv_memset(table, 0, sizeof(table));
    224 
    225    {
    226        // Set "don't care" values for reserved ranges.
    227        int32_t index = scriptsIndex[
    228                numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
    229        if(index != 0) {
    230            table[index] = 0xff;
    231        }
    232        index = scriptsIndex[
    233                numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
    234        if(index != 0) {
    235            table[index] = 0xff;
    236        }
    237    }
    238 
    239    // Never reorder special low and high primary lead bytes.
           // lowStart grows upward as ranges are placed at the bottom;
           // highLimit shrinks downward as ranges are placed at the top.
    240    U_ASSERT(scriptStartsLength >= 2);
    241    U_ASSERT(scriptStarts[0] == 0);
    242    int32_t lowStart = scriptStarts[1];
    243    U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
    244    int32_t highLimit = scriptStarts[scriptStartsLength - 1];
    245    U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
    246 
    247    // Get the set of special reorder codes in the input list.
    248    // This supports a fixed number of special reorder codes;
    249    // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
           // Bit i of `specials` is set when UCOL_REORDER_CODE_FIRST + i occurs in the list.
    250    uint32_t specials = 0;
    251    for(int32_t i = 0; i < length; ++i) {
    252        int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
    253        if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
    254            specials |= static_cast<uint32_t>(1) << reorderCode;
    255        }
    256    }
    257 
    258    // Start the reordering with the special low reorder codes that do not occur in the input.
    259    for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
    260        int32_t index = scriptsIndex[numScripts + i];
    261        if (index != 0 && (specials & (static_cast<uint32_t>(1) << i)) == 0) {
    262            lowStart = addLowScriptRange(table, index, lowStart);
    263        }
    264    }
    265 
    266    // Skip the reserved range before Latin if Latin is the first script,
    267    // so that we do not move it unnecessarily.
           // skippedReserved records how much was skipped, for the overflow retry below.
    268    int32_t skippedReserved = 0;
    269    if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
    270        int32_t index = scriptsIndex[USCRIPT_LATIN];
    271        U_ASSERT(index != 0);
    272        int32_t start = scriptStarts[index];
    273        U_ASSERT(lowStart <= start);
    274        skippedReserved = start - lowStart;
    275        lowStart = start;
    276    }
    277 
    278    // Reorder according to the input scripts, continuing from the bottom of the primary range.
    279    int32_t originalLength = length;  // length will be decremented if "others" is in the list.
    280    UBool hasReorderToEnd = false;
    281    for(int32_t i = 0; i < length;) {
    282        int32_t script = reorder[i++];
    283        if(script == USCRIPT_UNKNOWN) {
    284            // Put the remaining scripts at the top.
                   // The codes after USCRIPT_UNKNOWN are consumed from the end of the
                   // list backwards, each one placed directly below the previous one.
    285            hasReorderToEnd = true;
    286            while(i < length) {
    287                script = reorder[--length];
    288                if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
    289                        script == UCOL_REORDER_CODE_DEFAULT) {
    290                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    291                    return;
    292                }
    293                int32_t index = getScriptIndex(script);
    294                if(index == 0) { continue; }
    295                if(table[index] != 0) {  // Duplicate or equivalent script.
    296                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    297                    return;
    298                }
    299                highLimit = addHighScriptRange(table, index, highLimit);
    300            }
    301            break;
    302        }
    303        if(script == UCOL_REORDER_CODE_DEFAULT) {
    304            // The default code must be the only one in the list, and that is handled by the caller.
    305            // Otherwise it must not be used.
    306            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    307            return;
    308        }
    309        int32_t index = getScriptIndex(script);
    310        if(index == 0) { continue; }
    311        if(table[index] != 0) {  // Duplicate or equivalent script.
    312            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    313            return;
    314        }
    315        lowStart = addLowScriptRange(table, index, lowStart);
    316    }
    317 
    318    // Put all remaining scripts into the middle.
           // Ranges already placed, and "don't care" ranges (0xff), are left alone.
    319    for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
    320        int32_t leadByte = table[i];
    321        if(leadByte != 0) { continue; }
    322        int32_t start = scriptStarts[i];
    323        if(!hasReorderToEnd && start > lowStart) {
    324            // No need to move this script.
    325            lowStart = start;
    326        }
    327        lowStart = addLowScriptRange(table, i, lowStart);
    328    }
           // The ranges placed from the bottom must not run into those placed from the top.
    329    if(lowStart > highLimit) {
               // Only the upper byte (& 0xff00) of the skipped amount is counted here.
    330        if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
    331            // Try not skipping the before-Latin reserved range.
                   // This retry cannot recurse again: with latinMustMove=true
                   // skippedReserved stays 0, so the condition above is false.
    332            makeReorderRanges(reorder, originalLength, true, ranges, errorCode);
    333            return;
    334        }
    335        // We need more primary lead bytes than available, despite the reserved ranges.
    336        errorCode = U_BUFFER_OVERFLOW_ERROR;
    337        return;
    338    }
    339 
    340    // Turn lead bytes into a list of (limit, offset) pairs.
    341    // Encode each pair in one list element:
    342    // Upper 16 bits = limit, lower 16 = signed lead byte offset.
           // `offset` is the lead-byte delta of the current run of ranges; the inner
           // loop advances i to the first range whose delta differs (or to the end).
    343    int32_t offset = 0;
    344    for(int32_t i = 1;; ++i) {
    345        int32_t nextOffset = offset;
    346        while(i < scriptStartsLength - 1) {
    347            int32_t newLeadByte = table[i];
    348            if(newLeadByte == 0xff) {
    349                // "Don't care" lead byte for reserved range, continue with current offset.
    350            } else {
    351                nextOffset = newLeadByte - (scriptStarts[i] >> 8);
    352                if(nextOffset != offset) { break; }
    353            }
    354            ++i;
    355        }
               // A final run with offset 0 gets no entry.
    356        if(offset != 0 || i < scriptStartsLength - 1) {
    357            ranges.addElement((static_cast<int32_t>(scriptStarts[i]) << 16) | (offset & 0xffff), errorCode);
    358        }
    359        if(i == scriptStartsLength - 1) { break; }
    360        offset = nextOffset;
    361    }
    362 }
    363 
    364 int32_t
    365 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
    366    int32_t start = scriptStarts[index];
    367    if((start & 0xff) < (lowStart & 0xff)) {
    368        lowStart += 0x100;
    369    }
    370    table[index] = static_cast<uint8_t>(lowStart >> 8);
    371    int32_t limit = scriptStarts[index + 1];
    372    lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
    373    return lowStart;
    374 }
    375 
    376 int32_t
    377 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
    378    int32_t limit = scriptStarts[index + 1];
    379    if((limit & 0xff) > (highLimit & 0xff)) {
    380        highLimit -= 0x100;
    381    }
    382    int32_t start = scriptStarts[index];
    383    highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
    384    table[index] = static_cast<uint8_t>(highLimit >> 8);
    385    return highLimit;
    386 }
    387 
    388 U_NAMESPACE_END
    389 
    390 #endif  // !UCONFIG_NO_COLLATION