tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

collationcompare.cpp (15286B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 1996-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * collationcompare.cpp
      9 *
     10 * created on: 2012feb14 with new and old collation code
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/ucol.h"
     19 #include "cmemory.h"
     20 #include "collation.h"
     21 #include "collationcompare.h"
     22 #include "collationiterator.h"
     23 #include "collationsettings.h"
     24 #include "uassert.h"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 UCollationResult
     29 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
     30                                        const CollationSettings &settings,
     31                                        UErrorCode &errorCode) {
     32    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
     33 
     34    int32_t options = settings.options;
     35    uint32_t variableTop;
     36    if((options & CollationSettings::ALTERNATE_MASK) == 0) {
     37        variableTop = 0;
     38    } else {
     39        // +1 so that we can use "<" and primary ignorables test out early.
     40        variableTop = settings.variableTop + 1;
     41    }
     42    UBool anyVariable = false;
     43 
     44    // Fetch CEs, compare primaries, store secondary & tertiary weights.
     45    for(;;) {
     46        // We fetch CEs until we get a non-ignorable primary or reach the end.
     47        uint32_t leftPrimary;
     48        do {
     49            int64_t ce = left.nextCE(errorCode);
     50            leftPrimary = static_cast<uint32_t>(ce >> 32);
     51            if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
     52                // Variable CE, shift it to quaternary level.
     53                // Ignore all following primary ignorables, and shift further variable CEs.
     54                anyVariable = true;
     55                do {
     56                    // Store only the primary of the variable CE.
     57                    left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
     58                    for(;;) {
     59                        ce = left.nextCE(errorCode);
     60                        leftPrimary = static_cast<uint32_t>(ce >> 32);
     61                        if(leftPrimary == 0) {
     62                            left.setCurrentCE(0);
     63                        } else {
     64                            break;
     65                        }
     66                    }
     67                } while(leftPrimary < variableTop &&
     68                        leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
     69            }
     70        } while(leftPrimary == 0);
     71 
     72        uint32_t rightPrimary;
     73        do {
     74            int64_t ce = right.nextCE(errorCode);
     75            rightPrimary = static_cast<uint32_t>(ce >> 32);
     76            if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
     77                // Variable CE, shift it to quaternary level.
     78                // Ignore all following primary ignorables, and shift further variable CEs.
     79                anyVariable = true;
     80                do {
     81                    // Store only the primary of the variable CE.
     82                    right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
     83                    for(;;) {
     84                        ce = right.nextCE(errorCode);
     85                        rightPrimary = static_cast<uint32_t>(ce >> 32);
     86                        if(rightPrimary == 0) {
     87                            right.setCurrentCE(0);
     88                        } else {
     89                            break;
     90                        }
     91                    }
     92                } while(rightPrimary < variableTop &&
     93                        rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
     94            }
     95        } while(rightPrimary == 0);
     96 
     97        if(leftPrimary != rightPrimary) {
     98            // Return the primary difference, with script reordering.
     99            if(settings.hasReordering()) {
    100                leftPrimary = settings.reorder(leftPrimary);
    101                rightPrimary = settings.reorder(rightPrimary);
    102            }
    103            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
    104        }
    105        if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
    106    }
    107    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    108 
    109    // Compare the buffered secondary & tertiary weights.
    110    // We might skip the secondary level but continue with the case level
    111    // which is turned on separately.
    112    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
    113        if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
    114            int32_t leftIndex = 0;
    115            int32_t rightIndex = 0;
    116            for(;;) {
    117                uint32_t leftSecondary;
    118                do {
    119                    leftSecondary = static_cast<uint32_t>(left.getCE(leftIndex++)) >> 16;
    120                } while(leftSecondary == 0);
    121 
    122                uint32_t rightSecondary;
    123                do {
    124                    rightSecondary = static_cast<uint32_t>(right.getCE(rightIndex++)) >> 16;
    125                } while(rightSecondary == 0);
    126 
    127                if(leftSecondary != rightSecondary) {
    128                    return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    129                }
    130                if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
    131            }
    132        } else {
    133            // The backwards secondary level compares secondary weights backwards
    134            // within segments separated by the merge separator (U+FFFE, weight 02).
    135            int32_t leftStart = 0;
    136            int32_t rightStart = 0;
    137            for(;;) {
    138                // Find the merge separator or the NO_CE terminator.
    139                uint32_t p;
    140                int32_t leftLimit = leftStart;
    141                while ((p = static_cast<uint32_t>(left.getCE(leftLimit) >> 32)) >
    142                            Collation::MERGE_SEPARATOR_PRIMARY ||
    143                        p == 0) {
    144                    ++leftLimit;
    145                }
    146                int32_t rightLimit = rightStart;
    147                while ((p = static_cast<uint32_t>(right.getCE(rightLimit) >> 32)) >
    148                            Collation::MERGE_SEPARATOR_PRIMARY ||
    149                        p == 0) {
    150                    ++rightLimit;
    151                }
    152 
    153                // Compare the segments.
    154                int32_t leftIndex = leftLimit;
    155                int32_t rightIndex = rightLimit;
    156                for(;;) {
    157                    int32_t leftSecondary = 0;
    158                    while(leftSecondary == 0 && leftIndex > leftStart) {
    159                        leftSecondary = static_cast<uint32_t>(left.getCE(--leftIndex)) >> 16;
    160                    }
    161 
    162                    int32_t rightSecondary = 0;
    163                    while(rightSecondary == 0 && rightIndex > rightStart) {
    164                        rightSecondary = static_cast<uint32_t>(right.getCE(--rightIndex)) >> 16;
    165                    }
    166 
    167                    if(leftSecondary != rightSecondary) {
    168                        return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
    169                    }
    170                    if(leftSecondary == 0) { break; }
    171                }
    172 
    173                // Did we reach the end of either string?
    174                // Both strings have the same number of merge separators,
    175                // or else there would have been a primary-level difference.
    176                U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
    177                if(p == Collation::NO_CE_PRIMARY) { break; }
    178                // Skip both merge separators and continue.
    179                leftStart = leftLimit + 1;
    180                rightStart = rightLimit + 1;
    181            }
    182        }
    183    }
    184 
    185    if((options & CollationSettings::CASE_LEVEL) != 0) {
    186        int32_t strength = CollationSettings::getStrength(options);
    187        int32_t leftIndex = 0;
    188        int32_t rightIndex = 0;
    189        for(;;) {
    190            uint32_t leftCase, leftLower32, rightCase;
    191            if(strength == UCOL_PRIMARY) {
    192                // Primary+caseLevel: Ignore case level weights of primary ignorables.
    193                // Otherwise we would get a-umlaut > a
    194                // which is not desirable for accent-insensitive sorting.
    195                // Check for (lower 32 bits) == 0 as well because variable CEs are stored
    196                // with only primary weights.
    197                int64_t ce;
    198                do {
    199                    ce = left.getCE(leftIndex++);
    200                    leftCase = static_cast<uint32_t>(ce);
    201                } while (static_cast<uint32_t>(ce >> 32) == 0 || leftCase == 0);
    202                leftLower32 = leftCase;
    203                leftCase &= 0xc000;
    204 
    205                do {
    206                    ce = right.getCE(rightIndex++);
    207                    rightCase = static_cast<uint32_t>(ce);
    208                } while (static_cast<uint32_t>(ce >> 32) == 0 || rightCase == 0);
    209                rightCase &= 0xc000;
    210            } else {
    211                // Secondary+caseLevel: By analogy with the above,
    212                // ignore case level weights of secondary ignorables.
    213                //
    214                // Note: A tertiary CE has uppercase case bits (0.0.ut)
    215                // to keep tertiary+caseFirst well-formed.
    216                //
    217                // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
    218                // Otherwise a tertiary CE's uppercase would be no greater than
    219                // a primary/secondary CE's uppercase.
    220                // (See UCA well-formedness condition 2.)
    221                // We could construct a special case weight higher than uppercase,
    222                // but it's simpler to always ignore case weights of secondary ignorables,
    223                // turning 0.0.ut into 0.0.0.t.
    224                // (See LDML Collation, Case Parameters.)
    225                do {
    226                    leftCase = static_cast<uint32_t>(left.getCE(leftIndex++));
    227                } while(leftCase <= 0xffff);
    228                leftLower32 = leftCase;
    229                leftCase &= 0xc000;
    230 
    231                do {
    232                    rightCase = static_cast<uint32_t>(right.getCE(rightIndex++));
    233                } while(rightCase <= 0xffff);
    234                rightCase &= 0xc000;
    235            }
    236 
    237            // No need to handle NO_CE and MERGE_SEPARATOR specially:
    238            // There is one case weight for each previous-level weight,
    239            // so level length differences were handled there.
    240            if(leftCase != rightCase) {
    241                if((options & CollationSettings::UPPER_FIRST) == 0) {
    242                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
    243                } else {
    244                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
    245                }
    246            }
    247            if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
    248        }
    249    }
    250    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
    251 
    252    uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
    253 
    254    int32_t leftIndex = 0;
    255    int32_t rightIndex = 0;
    256    uint32_t anyQuaternaries = 0;
    257    for(;;) {
    258        uint32_t leftLower32, leftTertiary;
    259        do {
    260            leftLower32 = static_cast<uint32_t>(left.getCE(leftIndex++));
    261            anyQuaternaries |= leftLower32;
    262            U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
    263                     (leftLower32 & 0xc0c0) == 0);
    264            leftTertiary = leftLower32 & tertiaryMask;
    265        } while(leftTertiary == 0);
    266 
    267        uint32_t rightLower32, rightTertiary;
    268        do {
    269            rightLower32 = static_cast<uint32_t>(right.getCE(rightIndex++));
    270            anyQuaternaries |= rightLower32;
    271            U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
    272                     (rightLower32 & 0xc0c0) == 0);
    273            rightTertiary = rightLower32 & tertiaryMask;
    274        } while(rightTertiary == 0);
    275 
    276        if(leftTertiary != rightTertiary) {
    277            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
    278                // Pass through NO_CE and keep real tertiary weights larger than that.
    279                // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
    280                // to keep tertiary CEs well-formed.
    281                // Their case+tertiary weights must be greater than those of
    282                // primary and secondary CEs.
    283                if(leftTertiary > Collation::NO_CE_WEIGHT16) {
    284                    if(leftLower32 > 0xffff) {
    285                        leftTertiary ^= 0xc000;
    286                    } else {
    287                        leftTertiary += 0x4000;
    288                    }
    289                }
    290                if(rightTertiary > Collation::NO_CE_WEIGHT16) {
    291                    if(rightLower32 > 0xffff) {
    292                        rightTertiary ^= 0xc000;
    293                    } else {
    294                        rightTertiary += 0x4000;
    295                    }
    296                }
    297            }
    298            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
    299        }
    300        if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
    301    }
    302    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
    303 
    304    if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
    305        // If there are no "variable" CEs and no non-zero quaternary weights,
    306        // then there are no quaternary differences.
    307        return UCOL_EQUAL;
    308    }
    309 
    310    leftIndex = 0;
    311    rightIndex = 0;
    312    for(;;) {
    313        uint32_t leftQuaternary;
    314        do {
    315            int64_t ce = left.getCE(leftIndex++);
    316            leftQuaternary = static_cast<uint32_t>(ce) & 0xffff;
    317            if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
    318                // Variable primary or completely ignorable or NO_CE.
    319                leftQuaternary = static_cast<uint32_t>(ce >> 32);
    320            } else {
    321                // Regular CE, not tertiary ignorable.
    322                // Preserve the quaternary weight in bits 7..6.
    323                leftQuaternary |= 0xffffff3f;
    324            }
    325        } while(leftQuaternary == 0);
    326 
    327        uint32_t rightQuaternary;
    328        do {
    329            int64_t ce = right.getCE(rightIndex++);
    330            rightQuaternary = static_cast<uint32_t>(ce) & 0xffff;
    331            if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
    332                // Variable primary or completely ignorable or NO_CE.
    333                rightQuaternary = static_cast<uint32_t>(ce >> 32);
    334            } else {
    335                // Regular CE, not tertiary ignorable.
    336                // Preserve the quaternary weight in bits 7..6.
    337                rightQuaternary |= 0xffffff3f;
    338            }
    339        } while(rightQuaternary == 0);
    340 
    341        if(leftQuaternary != rightQuaternary) {
    342            // Return the difference, with script reordering.
    343            if(settings.hasReordering()) {
    344                leftQuaternary = settings.reorder(leftQuaternary);
    345                rightQuaternary = settings.reorder(rightQuaternary);
    346            }
    347            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
    348        }
    349        if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
    350    }
    351    return UCOL_EQUAL;
    352 }
    353 
    354 U_NAMESPACE_END
    355 
    356 #endif  // !UCONFIG_NO_COLLATION