tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationfastlatin.cpp (42861B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationfastlatin.cpp
      9 *
     10 * created on: 2013aug18
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/ucol.h"
     19 #include "collationdata.h"
     20 #include "collationfastlatin.h"
     21 #include "collationsettings.h"
     22 #include "uassert.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 int32_t
     27 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
     28                               uint16_t *primaries, int32_t capacity) {
     29    const uint16_t *table = data->fastLatinTable;
     30    if(table == nullptr) { return -1; }
     31    U_ASSERT(capacity == LATIN_LIMIT);
     32    if(capacity != LATIN_LIMIT) { return -1; }
     33 
     34    uint32_t miniVarTop;
     35    if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
     36        // No mini primaries are variable, set a variableTop just below the
     37        // lowest long mini primary.
     38        miniVarTop = MIN_LONG - 1;
     39    } else {
     40        int32_t headerLength = *table & 0xff;
     41        int32_t i = 1 + settings.getMaxVariable();
     42        if(i >= headerLength) {
     43            return -1;  // variableTop >= digits, should not occur
     44        }
     45        miniVarTop = table[i];
     46    }
     47 
     48    UBool digitsAreReordered = false;
     49    if(settings.hasReordering()) {
     50        uint32_t prevStart = 0;
     51        uint32_t beforeDigitStart = 0;
     52        uint32_t digitStart = 0;
     53        uint32_t afterDigitStart = 0;
     54        for(int32_t group = UCOL_REORDER_CODE_FIRST;
     55                group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
     56                ++group) {
     57            uint32_t start = data->getFirstPrimaryForGroup(group);
     58            start = settings.reorder(start);
     59            if(group == UCOL_REORDER_CODE_DIGIT) {
     60                beforeDigitStart = prevStart;
     61                digitStart = start;
     62            } else if(start != 0) {
     63                if(start < prevStart) {
     64                    // The permutation affects the groups up to Latin.
     65                    return -1;
     66                }
     67                // In the future, there might be a special group between digits & Latin.
     68                if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
     69                    afterDigitStart = start;
     70                }
     71                prevStart = start;
     72            }
     73        }
     74        uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
     75        latinStart = settings.reorder(latinStart);
     76        if(latinStart < prevStart) {
     77            return -1;
     78        }
     79        if(afterDigitStart == 0) {
     80            afterDigitStart = latinStart;
     81        }
     82        if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
     83            digitsAreReordered = true;
     84        }
     85    }
     86 
     87    table += (table[0] & 0xff);  // skip the header
     88    for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
     89        uint32_t p = table[c];
     90        if(p >= MIN_SHORT) {
     91            p &= SHORT_PRIMARY_MASK;
     92        } else if(p > miniVarTop) {
     93            p &= LONG_PRIMARY_MASK;
     94        } else {
     95            p = 0;
     96        }
     97        primaries[c] = static_cast<uint16_t>(p);
     98    }
     99    if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
    100        // Bail out for digits.
    101        for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
    102    }
    103 
    104    // Shift the miniVarTop above other options.
    105    return (static_cast<int32_t>(miniVarTop) << 16) | settings.options;
    106 }
    107 
    108 int32_t
    109 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
    110                                 const char16_t *left, int32_t leftLength,
    111                                 const char16_t *right, int32_t rightLength) {
    112    // This is a modified copy of CollationCompare::compareUpToQuaternary(),
    113    // optimized for common Latin text.
    114    // Keep them in sync!
    115    // Keep compareUTF16() and compareUTF8() in sync very closely!
    116 
    117    U_ASSERT((table[0] >> 8) == VERSION);
    118    table += (table[0] & 0xff);  // skip the header
    119    uint32_t variableTop = static_cast<uint32_t>(options) >> 16; // see getOptions()
    120    options &= 0xffff;  // needed for CollationSettings::getStrength() to work
    121 
    122    // Check for supported characters, fetch mini CEs, and compare primaries.
    123    int32_t leftIndex = 0, rightIndex = 0;
    124    /**
    125     * Single mini CE or a pair.
    126     * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
    127     * If there is only one, then it is in the lower bits, and the upper bits are 0.
    128     */
    129    uint32_t leftPair = 0, rightPair = 0;
    130    for(;;) {
    131        // We fetch CEs until we get a non-ignorable primary or reach the end.
    132        while(leftPair == 0) {
    133            if(leftIndex == leftLength) {
    134                leftPair = EOS;
    135                break;
    136            }
    137            UChar32 c = left[leftIndex++];
    138            if(c <= LATIN_MAX) {
    139                leftPair = primaries[c];
    140                if(leftPair != 0) { break; }
    141                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    142                    return BAIL_OUT_RESULT;
    143                }
    144                leftPair = table[c];
    145            } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    146                leftPair = table[c - PUNCT_START + LATIN_LIMIT];
    147            } else {
    148                leftPair = lookup(table, c);
    149            }
    150            if(leftPair >= MIN_SHORT) {
    151                leftPair &= SHORT_PRIMARY_MASK;
    152                break;
    153            } else if(leftPair > variableTop) {
    154                leftPair &= LONG_PRIMARY_MASK;
    155                break;
    156            } else {
    157                leftPair = nextPair(table, c, leftPair, left, nullptr, leftIndex, leftLength);
    158                if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    159                leftPair = getPrimaries(variableTop, leftPair);
    160            }
    161        }
    162 
    163        while(rightPair == 0) {
    164            if(rightIndex == rightLength) {
    165                rightPair = EOS;
    166                break;
    167            }
    168            UChar32 c = right[rightIndex++];
    169            if(c <= LATIN_MAX) {
    170                rightPair = primaries[c];
    171                if(rightPair != 0) { break; }
    172                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    173                    return BAIL_OUT_RESULT;
    174                }
    175                rightPair = table[c];
    176            } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    177                rightPair = table[c - PUNCT_START + LATIN_LIMIT];
    178            } else {
    179                rightPair = lookup(table, c);
    180            }
    181            if(rightPair >= MIN_SHORT) {
    182                rightPair &= SHORT_PRIMARY_MASK;
    183                break;
    184            } else if(rightPair > variableTop) {
    185                rightPair &= LONG_PRIMARY_MASK;
    186                break;
    187            } else {
    188                rightPair = nextPair(table, c, rightPair, right, nullptr, rightIndex, rightLength);
    189                if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    190                rightPair = getPrimaries(variableTop, rightPair);
    191            }
    192        }
    193 
    194        if(leftPair == rightPair) {
    195            if(leftPair == EOS) { break; }
    196            leftPair = rightPair = 0;
    197            continue;
    198        }
    199        uint32_t leftPrimary = leftPair & 0xffff;
    200        uint32_t rightPrimary = rightPair & 0xffff;
    201        if(leftPrimary != rightPrimary) {
    202            // Return the primary difference.
    203            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    204        }
    205        if(leftPair == EOS) { break; }
    206        leftPair >>= 16;
    207        rightPair >>= 16;
    208    }
    209    // In the following, we need to re-fetch each character because we did not buffer the CEs,
    210    // but we know that the string is well-formed and
    211    // only contains supported characters and mappings.
    212 
    213    // We might skip the secondary level but continue with the case level
    214    // which is turned on separately.
    215    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    216        leftIndex = rightIndex = 0;
    217        leftPair = rightPair = 0;
    218        for(;;) {
    219            while(leftPair == 0) {
    220                if(leftIndex == leftLength) {
    221                    leftPair = EOS;
    222                    break;
    223                }
    224                UChar32 c = left[leftIndex++];
    225                if(c <= LATIN_MAX) {
    226                    leftPair = table[c];
    227                } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    228                    leftPair = table[c - PUNCT_START + LATIN_LIMIT];
    229                } else {
    230                    leftPair = lookup(table, c);
    231                }
    232                if(leftPair >= MIN_SHORT) {
    233                    leftPair = getSecondariesFromOneShortCE(leftPair);
    234                    break;
    235                } else if(leftPair > variableTop) {
    236                    leftPair = COMMON_SEC_PLUS_OFFSET;
    237                    break;
    238                } else {
    239                    leftPair = nextPair(table, c, leftPair, left, nullptr, leftIndex, leftLength);
    240                    leftPair = getSecondaries(variableTop, leftPair);
    241                }
    242            }
    243 
    244            while(rightPair == 0) {
    245                if(rightIndex == rightLength) {
    246                    rightPair = EOS;
    247                    break;
    248                }
    249                UChar32 c = right[rightIndex++];
    250                if(c <= LATIN_MAX) {
    251                    rightPair = table[c];
    252                } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    253                    rightPair = table[c - PUNCT_START + LATIN_LIMIT];
    254                } else {
    255                    rightPair = lookup(table, c);
    256                }
    257                if(rightPair >= MIN_SHORT) {
    258                    rightPair = getSecondariesFromOneShortCE(rightPair);
    259                    break;
    260                } else if(rightPair > variableTop) {
    261                    rightPair = COMMON_SEC_PLUS_OFFSET;
    262                    break;
    263                } else {
    264                    rightPair = nextPair(table, c, rightPair, right, nullptr, rightIndex, rightLength);
    265                    rightPair = getSecondaries(variableTop, rightPair);
    266                }
    267            }
    268 
    269            if(leftPair == rightPair) {
    270                if(leftPair == EOS) { break; }
    271                leftPair = rightPair = 0;
    272                continue;
    273            }
    274            uint32_t leftSecondary = leftPair & 0xffff;
    275            uint32_t rightSecondary = rightPair & 0xffff;
    276            if(leftSecondary != rightSecondary) {
    277                if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
    278                    // Full support for backwards secondary requires backwards contraction matching
    279                    // and moving backwards between merge separators.
    280                    return BAIL_OUT_RESULT;
    281                }
    282                return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    283            }
    284            if(leftPair == EOS) { break; }
    285            leftPair >>= 16;
    286            rightPair >>= 16;
    287        }
    288    }
    289 
    290    if((options & CollationSettings::CASE_LEVEL) != 0) {
    291        UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
    292        leftIndex = rightIndex = 0;
    293        leftPair = rightPair = 0;
    294        for(;;) {
    295            while(leftPair == 0) {
    296                if(leftIndex == leftLength) {
    297                    leftPair = EOS;
    298                    break;
    299                }
    300                UChar32 c = left[leftIndex++];
    301                leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    302                if(leftPair < MIN_LONG) {
    303                    leftPair = nextPair(table, c, leftPair, left, nullptr, leftIndex, leftLength);
    304                }
    305                leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
    306            }
    307 
    308            while(rightPair == 0) {
    309                if(rightIndex == rightLength) {
    310                    rightPair = EOS;
    311                    break;
    312                }
    313                UChar32 c = right[rightIndex++];
    314                rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    315                if(rightPair < MIN_LONG) {
    316                    rightPair = nextPair(table, c, rightPair, right, nullptr, rightIndex, rightLength);
    317                }
    318                rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
    319            }
    320 
    321            if(leftPair == rightPair) {
    322                if(leftPair == EOS) { break; }
    323                leftPair = rightPair = 0;
    324                continue;
    325            }
    326            uint32_t leftCase = leftPair & 0xffff;
    327            uint32_t rightCase = rightPair & 0xffff;
    328            if(leftCase != rightCase) {
    329                if((options & CollationSettings::UPPER_FIRST) == 0) {
    330                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    331                } else {
    332                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    333                }
    334            }
    335            if(leftPair == EOS) { break; }
    336            leftPair >>= 16;
    337            rightPair >>= 16;
    338        }
    339    }
    340    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    341 
    342    // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    343    UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
    344 
    345    leftIndex = rightIndex = 0;
    346    leftPair = rightPair = 0;
    347    for(;;) {
    348        while(leftPair == 0) {
    349            if(leftIndex == leftLength) {
    350                leftPair = EOS;
    351                break;
    352            }
    353            UChar32 c = left[leftIndex++];
    354            leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    355            if(leftPair < MIN_LONG) {
    356                leftPair = nextPair(table, c, leftPair, left, nullptr, leftIndex, leftLength);
    357            }
    358            leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
    359        }
    360 
    361        while(rightPair == 0) {
    362            if(rightIndex == rightLength) {
    363                rightPair = EOS;
    364                break;
    365            }
    366            UChar32 c = right[rightIndex++];
    367            rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    368            if(rightPair < MIN_LONG) {
    369                rightPair = nextPair(table, c, rightPair, right, nullptr, rightIndex, rightLength);
    370            }
    371            rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
    372        }
    373 
    374        if(leftPair == rightPair) {
    375            if(leftPair == EOS) { break; }
    376            leftPair = rightPair = 0;
    377            continue;
    378        }
    379        uint32_t leftTertiary = leftPair & 0xffff;
    380        uint32_t rightTertiary = rightPair & 0xffff;
    381        if(leftTertiary != rightTertiary) {
    382            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    383                // Pass through EOS and MERGE_WEIGHT
    384                // and keep real tertiary weights larger than the MERGE_WEIGHT.
    385                // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    386                if(leftTertiary > MERGE_WEIGHT) {
    387                    leftTertiary ^= CASE_MASK;
    388                }
    389                if(rightTertiary > MERGE_WEIGHT) {
    390                    rightTertiary ^= CASE_MASK;
    391                }
    392            }
    393            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    394        }
    395        if(leftPair == EOS) { break; }
    396        leftPair >>= 16;
    397        rightPair >>= 16;
    398    }
    399    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    400 
    401    leftIndex = rightIndex = 0;
    402    leftPair = rightPair = 0;
    403    for(;;) {
    404        while(leftPair == 0) {
    405            if(leftIndex == leftLength) {
    406                leftPair = EOS;
    407                break;
    408            }
    409            UChar32 c = left[leftIndex++];
    410            leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    411            if(leftPair < MIN_LONG) {
    412                leftPair = nextPair(table, c, leftPair, left, nullptr, leftIndex, leftLength);
    413            }
    414            leftPair = getQuaternaries(variableTop, leftPair);
    415        }
    416 
    417        while(rightPair == 0) {
    418            if(rightIndex == rightLength) {
    419                rightPair = EOS;
    420                break;
    421            }
    422            UChar32 c = right[rightIndex++];
    423            rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
    424            if(rightPair < MIN_LONG) {
    425                rightPair = nextPair(table, c, rightPair, right, nullptr, rightIndex, rightLength);
    426            }
    427            rightPair = getQuaternaries(variableTop, rightPair);
    428        }
    429 
    430        if(leftPair == rightPair) {
    431            if(leftPair == EOS) { break; }
    432            leftPair = rightPair = 0;
    433            continue;
    434        }
    435        uint32_t leftQuaternary = leftPair & 0xffff;
    436        uint32_t rightQuaternary = rightPair & 0xffff;
    437        if(leftQuaternary != rightQuaternary) {
    438            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    439        }
    440        if(leftPair == EOS) { break; }
    441        leftPair >>= 16;
    442        rightPair >>= 16;
    443    }
    444    return UCOL_EQUAL;
    445 }
    446 
    447 int32_t
    448 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
    449                                 const uint8_t *left, int32_t leftLength,
    450                                 const uint8_t *right, int32_t rightLength) {
    451    // Keep compareUTF16() and compareUTF8() in sync very closely!
    452 
    453    U_ASSERT((table[0] >> 8) == VERSION);
    454    table += (table[0] & 0xff);  // skip the header
    455    uint32_t variableTop = static_cast<uint32_t>(options) >> 16; // see RuleBasedCollator::getFastLatinOptions()
    456    options &= 0xffff;  // needed for CollationSettings::getStrength() to work
    457 
    458    // Check for supported characters, fetch mini CEs, and compare primaries.
    459    int32_t leftIndex = 0, rightIndex = 0;
    460    /**
    461     * Single mini CE or a pair.
    462     * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
    463     * If there is only one, then it is in the lower bits, and the upper bits are 0.
    464     */
    465    uint32_t leftPair = 0, rightPair = 0;
    466    // Note: There is no need to assemble the code point.
    467    // We only need to look up the table entry for the character,
    468    // and nextPair() looks for whether c==0.
    469    for(;;) {
    470        // We fetch CEs until we get a non-ignorable primary or reach the end.
    471        while(leftPair == 0) {
    472            if(leftIndex == leftLength) {
    473                leftPair = EOS;
    474                break;
    475            }
    476            UChar32 c = left[leftIndex++];
    477            uint8_t t;
    478            if(c <= 0x7f) {
    479                leftPair = primaries[c];
    480                if(leftPair != 0) { break; }
    481                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    482                    return BAIL_OUT_RESULT;
    483                }
    484                leftPair = table[c];
    485            } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
    486                    0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
    487                ++leftIndex;
    488                c = ((c - 0xc2) << 6) + t;
    489                leftPair = primaries[c];
    490                if(leftPair != 0) { break; }
    491                leftPair = table[c];
    492            } else {
    493                leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
    494            }
    495            if(leftPair >= MIN_SHORT) {
    496                leftPair &= SHORT_PRIMARY_MASK;
    497                break;
    498            } else if(leftPair > variableTop) {
    499                leftPair &= LONG_PRIMARY_MASK;
    500                break;
    501            } else {
    502                leftPair = nextPair(table, c, leftPair, nullptr, left, leftIndex, leftLength);
    503                if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    504                leftPair = getPrimaries(variableTop, leftPair);
    505            }
    506        }
    507 
    508        while(rightPair == 0) {
    509            if(rightIndex == rightLength) {
    510                rightPair = EOS;
    511                break;
    512            }
    513            UChar32 c = right[rightIndex++];
    514            uint8_t t;
    515            if(c <= 0x7f) {
    516                rightPair = primaries[c];
    517                if(rightPair != 0) { break; }
    518                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
    519                    return BAIL_OUT_RESULT;
    520                }
    521                rightPair = table[c];
    522            } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
    523                    0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
    524                ++rightIndex;
    525                c = ((c - 0xc2) << 6) + t;
    526                rightPair = primaries[c];
    527                if(rightPair != 0) { break; }
    528                rightPair = table[c];
    529            } else {
    530                rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
    531            }
    532            if(rightPair >= MIN_SHORT) {
    533                rightPair &= SHORT_PRIMARY_MASK;
    534                break;
    535            } else if(rightPair > variableTop) {
    536                rightPair &= LONG_PRIMARY_MASK;
    537                break;
    538            } else {
    539                rightPair = nextPair(table, c, rightPair, nullptr, right, rightIndex, rightLength);
    540                if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
    541                rightPair = getPrimaries(variableTop, rightPair);
    542            }
    543        }
    544 
    545        if(leftPair == rightPair) {
    546            if(leftPair == EOS) { break; }
    547            leftPair = rightPair = 0;
    548            continue;
    549        }
    550        uint32_t leftPrimary = leftPair & 0xffff;
    551        uint32_t rightPrimary = rightPair & 0xffff;
    552        if(leftPrimary != rightPrimary) {
    553            // Return the primary difference.
    554            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    555        }
    556        if(leftPair == EOS) { break; }
    557        leftPair >>= 16;
    558        rightPair >>= 16;
    559    }
    560    // In the following, we need to re-fetch each character because we did not buffer the CEs,
    561    // but we know that the string is well-formed and
    562    // only contains supported characters and mappings.
    563 
    564    // We might skip the secondary level but continue with the case level
    565    // which is turned on separately.
    566    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    567        leftIndex = rightIndex = 0;
    568        leftPair = rightPair = 0;
    569        for(;;) {
    570            while(leftPair == 0) {
    571                if(leftIndex == leftLength) {
    572                    leftPair = EOS;
    573                    break;
    574                }
    575                UChar32 c = left[leftIndex++];
    576                if(c <= 0x7f) {
    577                    leftPair = table[c];
    578                } else if(c <= LATIN_MAX_UTF8_LEAD) {
    579                    leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
    580                } else {
    581                    leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
    582                }
    583                if(leftPair >= MIN_SHORT) {
    584                    leftPair = getSecondariesFromOneShortCE(leftPair);
    585                    break;
    586                } else if(leftPair > variableTop) {
    587                    leftPair = COMMON_SEC_PLUS_OFFSET;
    588                    break;
    589                } else {
    590                    leftPair = nextPair(table, c, leftPair, nullptr, left, leftIndex, leftLength);
    591                    leftPair = getSecondaries(variableTop, leftPair);
    592                }
    593            }
    594 
    595            while(rightPair == 0) {
    596                if(rightIndex == rightLength) {
    597                    rightPair = EOS;
    598                    break;
    599                }
    600                UChar32 c = right[rightIndex++];
    601                if(c <= 0x7f) {
    602                    rightPair = table[c];
    603                } else if(c <= LATIN_MAX_UTF8_LEAD) {
    604                    rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
    605                } else {
    606                    rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
    607                }
    608                if(rightPair >= MIN_SHORT) {
    609                    rightPair = getSecondariesFromOneShortCE(rightPair);
    610                    break;
    611                } else if(rightPair > variableTop) {
    612                    rightPair = COMMON_SEC_PLUS_OFFSET;
    613                    break;
    614                } else {
    615                    rightPair = nextPair(table, c, rightPair, nullptr, right, rightIndex, rightLength);
    616                    rightPair = getSecondaries(variableTop, rightPair);
    617                }
    618            }
    619 
    620            if(leftPair == rightPair) {
    621                if(leftPair == EOS) { break; }
    622                leftPair = rightPair = 0;
    623                continue;
    624            }
    625            uint32_t leftSecondary = leftPair & 0xffff;
    626            uint32_t rightSecondary = rightPair & 0xffff;
    627            if(leftSecondary != rightSecondary) {
    628                if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
    629                    // Full support for backwards secondary requires backwards contraction matching
    630                    // and moving backwards between merge separators.
    631                    return BAIL_OUT_RESULT;
    632                }
    633                return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    634            }
    635            if(leftPair == EOS) { break; }
    636            leftPair >>= 16;
    637            rightPair >>= 16;
    638        }
    639    }
    640 
    641    if((options & CollationSettings::CASE_LEVEL) != 0) {
    642        UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
    643        leftIndex = rightIndex = 0;
    644        leftPair = rightPair = 0;
    645        for(;;) {
    646            while(leftPair == 0) {
    647                if(leftIndex == leftLength) {
    648                    leftPair = EOS;
    649                    break;
    650                }
    651                UChar32 c = left[leftIndex++];
    652                leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    653                if(leftPair < MIN_LONG) {
    654                    leftPair = nextPair(table, c, leftPair, nullptr, left, leftIndex, leftLength);
    655                }
    656                leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
    657            }
    658 
    659            while(rightPair == 0) {
    660                if(rightIndex == rightLength) {
    661                    rightPair = EOS;
    662                    break;
    663                }
    664                UChar32 c = right[rightIndex++];
    665                rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    666                if(rightPair < MIN_LONG) {
    667                    rightPair = nextPair(table, c, rightPair, nullptr, right, rightIndex, rightLength);
    668                }
    669                rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
    670            }
    671 
    672            if(leftPair == rightPair) {
    673                if(leftPair == EOS) { break; }
    674                leftPair = rightPair = 0;
    675                continue;
    676            }
    677            uint32_t leftCase = leftPair & 0xffff;
    678            uint32_t rightCase = rightPair & 0xffff;
    679            if(leftCase != rightCase) {
    680                if((options & CollationSettings::UPPER_FIRST) == 0) {
    681                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    682                } else {
    683                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    684                }
    685            }
    686            if(leftPair == EOS) { break; }
    687            leftPair >>= 16;
    688            rightPair >>= 16;
    689        }
    690    }
    691    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    692 
    693    // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
    694    UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
    695 
    696    leftIndex = rightIndex = 0;
    697    leftPair = rightPair = 0;
    698    for(;;) {
    699        while(leftPair == 0) {
    700            if(leftIndex == leftLength) {
    701                leftPair = EOS;
    702                break;
    703            }
    704            UChar32 c = left[leftIndex++];
    705            leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    706            if(leftPair < MIN_LONG) {
    707                leftPair = nextPair(table, c, leftPair, nullptr, left, leftIndex, leftLength);
    708            }
    709            leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
    710        }
    711 
    712        while(rightPair == 0) {
    713            if(rightIndex == rightLength) {
    714                rightPair = EOS;
    715                break;
    716            }
    717            UChar32 c = right[rightIndex++];
    718            rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    719            if(rightPair < MIN_LONG) {
    720                rightPair = nextPair(table, c, rightPair, nullptr, right, rightIndex, rightLength);
    721            }
    722            rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
    723        }
    724 
    725        if(leftPair == rightPair) {
    726            if(leftPair == EOS) { break; }
    727            leftPair = rightPair = 0;
    728            continue;
    729        }
    730        uint32_t leftTertiary = leftPair & 0xffff;
    731        uint32_t rightTertiary = rightPair & 0xffff;
    732        if(leftTertiary != rightTertiary) {
    733            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    734                // Pass through EOS and MERGE_WEIGHT
    735                // and keep real tertiary weights larger than the MERGE_WEIGHT.
    736                // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    737                if(leftTertiary > MERGE_WEIGHT) {
    738                    leftTertiary ^= CASE_MASK;
    739                }
    740                if(rightTertiary > MERGE_WEIGHT) {
    741                    rightTertiary ^= CASE_MASK;
    742                }
    743            }
    744            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    745        }
    746        if(leftPair == EOS) { break; }
    747        leftPair >>= 16;
    748        rightPair >>= 16;
    749    }
    750    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    751 
    752    leftIndex = rightIndex = 0;
    753    leftPair = rightPair = 0;
    754    for(;;) {
    755        while(leftPair == 0) {
    756            if(leftIndex == leftLength) {
    757                leftPair = EOS;
    758                break;
    759            }
    760            UChar32 c = left[leftIndex++];
    761            leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
    762            if(leftPair < MIN_LONG) {
    763                leftPair = nextPair(table, c, leftPair, nullptr, left, leftIndex, leftLength);
    764            }
    765            leftPair = getQuaternaries(variableTop, leftPair);
    766        }
    767 
    768        while(rightPair == 0) {
    769            if(rightIndex == rightLength) {
    770                rightPair = EOS;
    771                break;
    772            }
    773            UChar32 c = right[rightIndex++];
    774            rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
    775            if(rightPair < MIN_LONG) {
    776                rightPair = nextPair(table, c, rightPair, nullptr, right, rightIndex, rightLength);
    777            }
    778            rightPair = getQuaternaries(variableTop, rightPair);
    779        }
    780 
    781        if(leftPair == rightPair) {
    782            if(leftPair == EOS) { break; }
    783            leftPair = rightPair = 0;
    784            continue;
    785        }
    786        uint32_t leftQuaternary = leftPair & 0xffff;
    787        uint32_t rightQuaternary = rightPair & 0xffff;
    788        if(leftQuaternary != rightQuaternary) {
    789            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    790        }
    791        if(leftPair == EOS) { break; }
    792        leftPair >>= 16;
    793        rightPair >>= 16;
    794    }
    795    return UCOL_EQUAL;
    796 }
    797 
    798 uint32_t
    799 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
    800    U_ASSERT(c > LATIN_MAX);
    801    if(PUNCT_START <= c && c < PUNCT_LIMIT) {
    802        return table[c - PUNCT_START + LATIN_LIMIT];
    803    } else if(c == 0xfffe) {
    804        return MERGE_WEIGHT;
    805    } else if(c == 0xffff) {
    806        return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
    807    } else {
    808        return BAIL_OUT;
    809    }
    810 }
    811 
    812 uint32_t
    813 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
    814                               const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
    815    // The caller handled ASCII and valid/supported Latin.
    816    U_ASSERT(c > 0x7f);
    817    int32_t i2 = sIndex + 1;
    818    if(i2 < sLength || sLength < 0) {
    819        uint8_t t1 = s8[sIndex];
    820        uint8_t t2 = s8[i2];
    821        sIndex += 2;
    822        if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
    823            return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
    824        } else if(c == 0xef && t1 == 0xbf) {
    825            if(t2 == 0xbe) {
    826                return MERGE_WEIGHT;  // U+FFFE
    827            } else if(t2 == 0xbf) {
    828                return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
    829            }
    830        }
    831    }
    832    return BAIL_OUT;
    833 }
    834 
    835 uint32_t
    836 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
    837                                     const uint8_t *s8, int32_t &sIndex) {
    838    // The caller handled ASCII.
    839    // The string is well-formed and contains only supported characters.
    840    U_ASSERT(c > 0x7f);
    841    if(c <= LATIN_MAX_UTF8_LEAD) {
    842        return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F
    843    }
    844    uint8_t t2 = s8[sIndex + 1];
    845    sIndex += 2;
    846    if(c == 0xe2) {
    847        return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
    848    } else if(t2 == 0xbe) {
    849        return MERGE_WEIGHT;  // U+FFFE
    850    } else {
    851        return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
    852    }
    853 }
    854 
    855 uint32_t
    856 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
    857                             const char16_t *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
    858    if(ce >= MIN_LONG || ce < CONTRACTION) {
    859        return ce;  // simple or special mini CE
    860    } else if(ce >= EXPANSION) {
    861        int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
    862        return (static_cast<uint32_t>(table[index + 1]) << 16) | table[index];
    863    } else /* ce >= CONTRACTION */ {
    864        if(c == 0 && sLength < 0) {
    865            sLength = sIndex - 1;
    866            return EOS;
    867        }
    868        // Contraction list: Default mapping followed by
    869        // 0 or more single-character contraction suffix mappings.
    870        int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
    871        if(sIndex != sLength) {
    872            // Read the next character.
    873            int32_t c2;
    874            int32_t nextIndex = sIndex;
    875            if(s16 != nullptr) {
    876                c2 = s16[nextIndex++];
    877                if(c2 > LATIN_MAX) {
    878                    if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
    879                        c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
    880                    } else if(c2 == 0xfffe || c2 == 0xffff) {
    881                        c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
    882                    } else {
    883                        return BAIL_OUT;
    884                    }
    885                }
    886            } else {
    887                c2 = s8[nextIndex++];
    888                if(c2 > 0x7f) {
    889                    uint8_t t;
    890                    if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
    891                            0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
    892                        c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F
    893                        ++nextIndex;
    894                    } else {
    895                        int32_t i2 = nextIndex + 1;
    896                        if(i2 < sLength || sLength < 0) {
    897                            if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
    898                                    0x80 <= (t = s8[i2]) && t <= 0xbf) {
    899                                c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF
    900                            } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
    901                                    ((t = s8[i2]) == 0xbe || t == 0xbf)) {
    902                                c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
    903                            } else {
    904                                return BAIL_OUT;
    905                            }
    906                        } else {
    907                            return BAIL_OUT;
    908                        }
    909                        nextIndex += 2;
    910                    }
    911                }
    912            }
    913            if(c2 == 0 && sLength < 0) {
    914                sLength = sIndex;
    915                c2 = -1;
    916            }
    917            // Look for the next character in the contraction suffix list,
    918            // which is in ascending order of single suffix characters.
    919            int32_t i = index;
    920            int32_t head = table[i];  // first skip the default mapping
    921            int32_t x;
    922            do {
    923                i += head >> CONTR_LENGTH_SHIFT;
    924                head = table[i];
    925                x = head & CONTR_CHAR_MASK;
    926            } while(x < c2);
    927            if(x == c2) {
    928                index = i;
    929                sIndex = nextIndex;
    930            }
    931        }
    932        // Return the CE or CEs for the default or contraction mapping.
    933        int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
    934        if(length == 1) {
    935            return BAIL_OUT;
    936        }
    937        ce = table[index + 1];
    938        if(length == 2) {
    939            return ce;
    940        } else {
    941            return (static_cast<uint32_t>(table[index + 2]) << 16) | ce;
    942        }
    943    }
    944 }
    945 
    946 uint32_t
    947 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
    948    if(pair <= 0xffff) {
    949        // one mini CE
    950        if(pair >= MIN_SHORT) {
    951            pair = getSecondariesFromOneShortCE(pair);
    952        } else if(pair > variableTop) {
    953            pair = COMMON_SEC_PLUS_OFFSET;
    954        } else if(pair >= MIN_LONG) {
    955            pair = 0;  // variable
    956        }
    957        // else special mini CE
    958    } else {
    959        uint32_t ce = pair & 0xffff;
    960        if(ce >= MIN_SHORT) {
    961            pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
    962        } else if(ce > variableTop) {
    963            pair = TWO_COMMON_SEC_PLUS_OFFSET;
    964        } else {
    965            U_ASSERT(ce >= MIN_LONG);
    966            pair = 0;  // variable
    967        }
    968    }
    969    return pair;
    970 }
    971 
    972 uint32_t
    973 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
    974    // Primary+caseLevel: Ignore case level weights of primary ignorables.
    975    // Otherwise: Ignore case level weights of secondary ignorables.
    976    // For details see the comments in the CollationCompare class.
    977    // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
    978    if(pair <= 0xffff) {
    979        // one mini CE
    980        if(pair >= MIN_SHORT) {
    981            // A high secondary weight means we really have two CEs,
    982            // a primary CE and a secondary CE.
    983            uint32_t ce = pair;
    984            pair &= CASE_MASK;  // explicit weight of primary CE
    985            if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
    986                pair |= LOWER_CASE << 16;  // implied weight of secondary CE
    987            }
    988        } else if(pair > variableTop) {
    989            pair = LOWER_CASE;
    990        } else if(pair >= MIN_LONG) {
    991            pair = 0;  // variable
    992        }
    993        // else special mini CE
    994    } else {
    995        // two mini CEs, same primary groups, neither expands like above
    996        uint32_t ce = pair & 0xffff;
    997        if(ce >= MIN_SHORT) {
    998            if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
    999                pair &= CASE_MASK;
   1000            } else {
   1001                pair &= TWO_CASES_MASK;
   1002            }
   1003        } else if(ce > variableTop) {
   1004            pair = TWO_LOWER_CASES;
   1005        } else {
   1006            U_ASSERT(ce >= MIN_LONG);
   1007            pair = 0;  // variable
   1008        }
   1009    }
   1010    return pair;
   1011 }
   1012 
   1013 uint32_t
   1014 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
   1015    if(pair <= 0xffff) {
   1016        // one mini CE
   1017        if(pair >= MIN_SHORT) {
   1018            // A high secondary weight means we really have two CEs,
   1019            // a primary CE and a secondary CE.
   1020            uint32_t ce = pair;
   1021            if(withCaseBits) {
   1022                pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
   1023                if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1024                    pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
   1025                }
   1026            } else {
   1027                pair = (pair & TERTIARY_MASK) + TER_OFFSET;
   1028                if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1029                    pair |= COMMON_TER_PLUS_OFFSET << 16;
   1030                }
   1031            }
   1032        } else if(pair > variableTop) {
   1033            pair = (pair & TERTIARY_MASK) + TER_OFFSET;
   1034            if(withCaseBits) {
   1035                pair |= LOWER_CASE;
   1036            }
   1037        } else if(pair >= MIN_LONG) {
   1038            pair = 0;  // variable
   1039        }
   1040        // else special mini CE
   1041    } else {
   1042        // two mini CEs, same primary groups, neither expands like above
   1043        uint32_t ce = pair & 0xffff;
   1044        if(ce >= MIN_SHORT) {
   1045            if(withCaseBits) {
   1046                pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
   1047            } else {
   1048                pair &= TWO_TERTIARIES_MASK;
   1049            }
   1050            pair += TWO_TER_OFFSETS;
   1051        } else if(ce > variableTop) {
   1052            pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
   1053            if(withCaseBits) {
   1054                pair |= TWO_LOWER_CASES;
   1055            }
   1056        } else {
   1057            U_ASSERT(ce >= MIN_LONG);
   1058            pair = 0;  // variable
   1059        }
   1060    }
   1061    return pair;
   1062 }
   1063 
   1064 uint32_t
   1065 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
   1066    // Return the primary weight of a variable CE,
   1067    // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
   1068    if(pair <= 0xffff) {
   1069        // one mini CE
   1070        if(pair >= MIN_SHORT) {
   1071            // A high secondary weight means we really have two CEs,
   1072            // a primary CE and a secondary CE.
   1073            if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
   1074                pair = TWO_SHORT_PRIMARIES_MASK;
   1075            } else {
   1076                pair = SHORT_PRIMARY_MASK;
   1077            }
   1078        } else if(pair > variableTop) {
   1079            pair = SHORT_PRIMARY_MASK;
   1080        } else if(pair >= MIN_LONG) {
   1081            pair &= LONG_PRIMARY_MASK;  // variable
   1082        }
   1083        // else special mini CE
   1084    } else {
   1085        // two mini CEs, same primary groups, neither expands like above
   1086        uint32_t ce = pair & 0xffff;
   1087        if(ce > variableTop) {
   1088            pair = TWO_SHORT_PRIMARIES_MASK;
   1089        } else {
   1090            U_ASSERT(ce >= MIN_LONG);
   1091            pair &= TWO_LONG_PRIMARIES_MASK;  // variable
   1092        }
   1093    }
   1094    return pair;
   1095 }
   1096 
   1097 U_NAMESPACE_END
   1098 
   1099 #endif  // !UCONFIG_NO_COLLATION