tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

coleitr.cpp (15790B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 1996-2014, International Business Machines Corporation and
      6 * others. All Rights Reserved.
      7 *******************************************************************************
      8 */
      9 
     10 /*
     11 * File coleitr.cpp
     12 *
     13 * Created by: Helena Shih
     14 *
     15 * Modification History:
     16 *
     17 *  Date      Name        Description
     18 *
     19 *  6/23/97   helena      Adding comments to make code more readable.
     20 * 08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
     21 * 12/10/99   aliu        Ported Thai collation support from Java.
     22 * 01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
     23 * 02/19/01   swquek      Removed CollationElementIterator() since it is 
     24 *                        private constructor and no calls are made to it
     25 * 2012-2014  markus      Rewritten in C++ again.
     26 */
     27 
     28 #include "unicode/utypes.h"
     29 
     30 #if !UCONFIG_NO_COLLATION
     31 
     32 #include "unicode/chariter.h"
     33 #include "unicode/coleitr.h"
     34 #include "unicode/tblcoll.h"
     35 #include "unicode/ustring.h"
     36 #include "cmemory.h"
     37 #include "collation.h"
     38 #include "collationdata.h"
     39 #include "collationiterator.h"
     40 #include "collationsets.h"
     41 #include "collationtailoring.h"
     42 #include "uassert.h"
     43 #include "uhash.h"
     44 #include "utf16collationiterator.h"
     45 #include "uvectr32.h"
     46 
     47 /* Constants --------------------------------------------------------------- */
     48 
     49 U_NAMESPACE_BEGIN
     50 
     51 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
     52 
     53 /* CollationElementIterator public constructor/destructor ------------------ */
     54 
     55 CollationElementIterator::CollationElementIterator(
     56                                         const CollationElementIterator& other) 
     57        : UObject(other), iter_(nullptr), rbc_(nullptr), otherHalf_(0), dir_(0), offsets_(nullptr) {
     58    *this = other;
     59 }
     60 
     61 CollationElementIterator::~CollationElementIterator()
     62 {
     63    delete iter_;
     64    delete offsets_;
     65 }
     66 
     67 /* CollationElementIterator public methods --------------------------------- */
     68 
     69 namespace {
     70 
     71 uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
     72    return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
     73 }
     74 uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
     75    return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
     76 }
     77 UBool ceNeedsTwoParts(int64_t ce) {
     78    return (ce & INT64_C(0xffff00ff003f)) != 0;
     79 }
     80 
     81 }  // namespace
     82 
     83 int32_t CollationElementIterator::getOffset() const
     84 {
     85    if (dir_ < 0 && offsets_ != nullptr && !offsets_->isEmpty()) {
     86        // CollationIterator::previousCE() decrements the CEs length
     87        // while it pops CEs from its internal buffer.
     88        int32_t i = iter_->getCEsLength();
     89        if (otherHalf_ != 0) {
     90            // Return the trailing CE offset while we are in the middle of a 64-bit CE.
     91            ++i;
     92        }
     93        U_ASSERT(i < offsets_->size());
     94        return offsets_->elementAti(i);
     95    }
     96    return iter_->getOffset();
     97 }
     98 
     99 /**
    100 * Get the ordering priority of the next character in the string.
    101 * @return the next character's ordering. Returns NULLORDER if an error has 
    102 *         occurred or if the end of string has been reached
    103 */
    104 int32_t CollationElementIterator::next(UErrorCode& status)
    105 {
    106    if (U_FAILURE(status)) { return NULLORDER; }
    107    if (dir_ > 1) {
    108        // Continue forward iteration. Test this first.
    109        if (otherHalf_ != 0) {
    110            uint32_t oh = otherHalf_;
    111            otherHalf_ = 0;
    112            return oh;
    113        }
    114    } else if (dir_ == 1) {
    115        // next() after setOffset()
    116        dir_ = 2;
    117    } else if (dir_ == 0) {
    118        // The iter_ is already reset to the start of the text.
    119        dir_ = 2;
    120    } else /* dir_ < 0 */ {
    121        // illegal change of direction
    122        status = U_INVALID_STATE_ERROR;
    123        return NULLORDER;
    124    }
    125    // No need to keep all CEs in the buffer when we iterate.
    126    iter_->clearCEsIfNoneRemaining();
    127    int64_t ce = iter_->nextCE(status);
    128    if (ce == Collation::NO_CE) { return NULLORDER; }
    129    // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
    130    uint32_t p = static_cast<uint32_t>(ce >> 32);
    131    uint32_t lower32 = static_cast<uint32_t>(ce);
    132    uint32_t firstHalf = getFirstHalf(p, lower32);
    133    uint32_t secondHalf = getSecondHalf(p, lower32);
    134    if (secondHalf != 0) {
    135        otherHalf_ = secondHalf | 0xc0;  // continuation CE
    136    }
    137    return firstHalf;
    138 }
    139 
    140 bool CollationElementIterator::operator!=(
    141                                  const CollationElementIterator& other) const
    142 {
    143    return !(*this == other);
    144 }
    145 
    146 bool CollationElementIterator::operator==(
    147                                    const CollationElementIterator& that) const
    148 {
    149    if (this == &that) {
    150        return true;
    151    }
    152 
    153    return
    154        (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
    155        otherHalf_ == that.otherHalf_ &&
    156        normalizeDir() == that.normalizeDir() &&
    157        string_ == that.string_ &&
    158        *iter_ == *that.iter_;
    159 }
    160 
    161 /**
    162 * Get the ordering priority of the previous collation element in the string.
    163 * @param status the error code status.
    164 * @return the previous element's ordering. Returns NULLORDER if an error has 
    165 *         occurred or if the start of string has been reached.
    166 */
    167 int32_t CollationElementIterator::previous(UErrorCode& status)
    168 {
    169    if (U_FAILURE(status)) { return NULLORDER; }
    170    if (dir_ < 0) {
    171        // Continue backwards iteration. Test this first.
    172        if (otherHalf_ != 0) {
    173            uint32_t oh = otherHalf_;
    174            otherHalf_ = 0;
    175            return oh;
    176        }
    177    } else if (dir_ == 0) {
    178        iter_->resetToOffset(string_.length());
    179        dir_ = -1;
    180    } else if (dir_ == 1) {
    181        // previous() after setOffset()
    182        dir_ = -1;
    183    } else /* dir_ > 1 */ {
    184        // illegal change of direction
    185        status = U_INVALID_STATE_ERROR;
    186        return NULLORDER;
    187    }
    188    if (offsets_ == nullptr) {
    189        offsets_ = new UVector32(status);
    190        if (offsets_ == nullptr) {
    191            status = U_MEMORY_ALLOCATION_ERROR;
    192            return NULLORDER;
    193        }
    194    }
    195    // If we already have expansion CEs, then we also have offsets.
    196    // Otherwise remember the trailing offset in case we need to
    197    // write offsets for an artificial expansion.
    198    int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
    199    int64_t ce = iter_->previousCE(*offsets_, status);
    200    if (ce == Collation::NO_CE) { return NULLORDER; }
    201    // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
    202    uint32_t p = static_cast<uint32_t>(ce >> 32);
    203    uint32_t lower32 = static_cast<uint32_t>(ce);
    204    uint32_t firstHalf = getFirstHalf(p, lower32);
    205    uint32_t secondHalf = getSecondHalf(p, lower32);
    206    if (secondHalf != 0) {
    207        if (offsets_->isEmpty()) {
    208            // When we convert a single 64-bit CE into two 32-bit CEs,
    209            // we need to make this artificial expansion behave like a normal expansion.
    210            // See CollationIterator::previousCE().
    211            offsets_->addElement(iter_->getOffset(), status);
    212            offsets_->addElement(limitOffset, status);
    213        }
    214        otherHalf_ = firstHalf;
    215        return secondHalf | 0xc0;  // continuation CE
    216    }
    217    return firstHalf;
    218 }
    219 
    220 /**
    221 * Resets the cursor to the beginning of the string.
    222 */
    223 void CollationElementIterator::reset()
    224 {
    225    iter_ ->resetToOffset(0);
    226    otherHalf_ = 0;
    227    dir_ = 0;
    228 }
    229 
    230 void CollationElementIterator::setOffset(int32_t newOffset, 
    231                                         UErrorCode& status)
    232 {
    233    if (U_FAILURE(status)) { return; }
    234    if (0 < newOffset && newOffset < string_.length()) {
    235        int32_t offset = newOffset;
    236        do {
    237            char16_t c = string_.charAt(offset);
    238            if (!rbc_->isUnsafe(c) ||
    239                    (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
    240                break;
    241            }
    242            // Back up to before this unsafe character.
    243            --offset;
    244        } while (offset > 0);
    245        if (offset < newOffset) {
    246            // We might have backed up more than necessary.
    247            // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
    248            // but for text "chu" setOffset(2) should remain at 2
    249            // although we initially back up to offset 0.
    250            // Find the last safe offset no greater than newOffset by iterating forward.
    251            int32_t lastSafeOffset = offset;
    252            do {
    253                iter_->resetToOffset(lastSafeOffset);
    254                do {
    255                    iter_->nextCE(status);
    256                    if (U_FAILURE(status)) { return; }
    257                } while ((offset = iter_->getOffset()) == lastSafeOffset);
    258                if (offset <= newOffset) {
    259                    lastSafeOffset = offset;
    260                }
    261            } while (offset < newOffset);
    262            newOffset = lastSafeOffset;
    263        }
    264    }
    265    iter_->resetToOffset(newOffset);
    266    otherHalf_ = 0;
    267    dir_ = 1;
    268 }
    269 
    270 /**
    271 * Sets the source to the new source string.
    272 */
    273 void CollationElementIterator::setText(const UnicodeString& source,
    274                                       UErrorCode& status)
    275 {
    276    if (U_FAILURE(status)) {
    277        return;
    278    }
    279 
    280    string_ = source;
    281    const char16_t *s = string_.getBuffer();
    282    CollationIterator *newIter;
    283    UBool numeric = rbc_->settings->isNumeric();
    284    if (rbc_->settings->dontCheckFCD()) {
    285        newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
    286    } else {
    287        newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
    288    }
    289    if (newIter == nullptr) {
    290        status = U_MEMORY_ALLOCATION_ERROR;
    291        return;
    292    }
    293    delete iter_;
    294    iter_ = newIter;
    295    otherHalf_ = 0;
    296    dir_ = 0;
    297 }
    298 
    299 // Sets the source to the new character iterator.
    300 void CollationElementIterator::setText(CharacterIterator& source, 
    301                                       UErrorCode& status)
    302 {
    303    if (U_FAILURE(status)) 
    304        return;
    305 
    306    source.getText(string_);
    307    setText(string_, status);
    308 }
    309 
    310 int32_t CollationElementIterator::strengthOrder(int32_t order) const
    311 {
    312    UColAttributeValue s = static_cast<UColAttributeValue>(rbc_->settings->getStrength());
    313    // Mask off the unwanted differences.
    314    if (s == UCOL_PRIMARY) {
    315        order &= 0xffff0000;
    316    }
    317    else if (s == UCOL_SECONDARY) {
    318        order &= 0xffffff00;
    319    }
    320 
    321    return order;
    322 }
    323 
    324 /* CollationElementIterator private constructors/destructors --------------- */
    325 
    326 /** 
    327 * This is the "real" constructor for this class; it constructs an iterator
    328 * over the source text using the specified collator
    329 */
    330 CollationElementIterator::CollationElementIterator(
    331                                               const UnicodeString &source,
    332                                               const RuleBasedCollator *coll,
    333                                               UErrorCode &status)
    334        : iter_(nullptr), rbc_(coll), otherHalf_(0), dir_(0), offsets_(nullptr) {
    335    setText(source, status);
    336 }
    337 
    338 /** 
    339 * This is the "real" constructor for this class; it constructs an iterator over 
    340 * the source text using the specified collator
    341 */
    342 CollationElementIterator::CollationElementIterator(
    343                                           const CharacterIterator &source,
    344                                           const RuleBasedCollator *coll,
    345                                           UErrorCode &status)
    346        : iter_(nullptr), rbc_(coll), otherHalf_(0), dir_(0), offsets_(nullptr) {
    347    // We only call source.getText() which should be const anyway.
    348    setText(const_cast<CharacterIterator &>(source), status);
    349 }
    350 
    351 /* CollationElementIterator private methods -------------------------------- */
    352 
    353 const CollationElementIterator& CollationElementIterator::operator=(
    354                                         const CollationElementIterator& other)
    355 {
    356    if (this == &other) {
    357        return *this;
    358    }
    359 
    360    CollationIterator *newIter;
    361    const FCDUTF16CollationIterator *otherFCDIter =
    362            dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
    363    if(otherFCDIter != nullptr) {
    364        newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
    365    } else {
    366        const UTF16CollationIterator *otherIter =
    367                dynamic_cast<const UTF16CollationIterator *>(other.iter_);
    368        if(otherIter != nullptr) {
    369            newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
    370        } else {
    371            newIter = nullptr;
    372        }
    373    }
    374    if(newIter != nullptr) {
    375        delete iter_;
    376        iter_ = newIter;
    377        rbc_ = other.rbc_;
    378        otherHalf_ = other.otherHalf_;
    379        dir_ = other.dir_;
    380 
    381        string_ = other.string_;
    382    }
    383    if(other.dir_ < 0 && other.offsets_ != nullptr && !other.offsets_->isEmpty()) {
    384        UErrorCode errorCode = U_ZERO_ERROR;
    385        if(offsets_ == nullptr) {
    386            offsets_ = new UVector32(other.offsets_->size(), errorCode);
    387        }
    388        if(offsets_ != nullptr) {
    389            offsets_->assign(*other.offsets_, errorCode);
    390        }
    391    }
    392    return *this;
    393 }
    394 
    395 namespace {
    396 
    397 class MaxExpSink : public ContractionsAndExpansions::CESink {
    398 public:
    399    MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
    400    virtual ~MaxExpSink();
    401    virtual void handleCE(int64_t /*ce*/) override {}
    402    virtual void handleExpansion(const int64_t ces[], int32_t length) override {
    403        if (length <= 1) {
    404            // We do not need to add single CEs into the map.
    405            return;
    406        }
    407        int32_t count = 0;  // number of CE "halves"
    408        for (int32_t i = 0; i < length; ++i) {
    409            count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
    410        }
    411        // last "half" of the last CE
    412        int64_t ce = ces[length - 1];
    413        uint32_t p = static_cast<uint32_t>(ce >> 32);
    414        uint32_t lower32 = static_cast<uint32_t>(ce);
    415        uint32_t lastHalf = getSecondHalf(p, lower32);
    416        if (lastHalf == 0) {
    417            lastHalf = getFirstHalf(p, lower32);
    418            U_ASSERT(lastHalf != 0);
    419        } else {
    420            lastHalf |= 0xc0;  // old-style continuation CE
    421        }
    422        if (count > uhash_igeti(maxExpansions, static_cast<int32_t>(lastHalf))) {
    423            uhash_iputi(maxExpansions, static_cast<int32_t>(lastHalf), count, &errorCode);
    424        }
    425    }
    426 
    427 private:
    428    UHashtable *maxExpansions;
    429    UErrorCode &errorCode;
    430 };
    431 
    432 MaxExpSink::~MaxExpSink() {}
    433 
    434 }  // namespace
    435 
    436 UHashtable *
    437 CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
    438    if (U_FAILURE(errorCode)) { return nullptr; }
    439    UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
    440                                           uhash_compareLong, &errorCode);
    441    if (U_FAILURE(errorCode)) { return nullptr; }
    442    MaxExpSink sink(maxExpansions, errorCode);
    443    ContractionsAndExpansions(nullptr, nullptr, &sink, true).forData(data, errorCode);
    444    if (U_FAILURE(errorCode)) {
    445        uhash_close(maxExpansions);
    446        return nullptr;
    447    }
    448    return maxExpansions;
    449 }
    450 
    451 int32_t
    452 CollationElementIterator::getMaxExpansion(int32_t order) const {
    453    return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
    454 }
    455 
    456 int32_t
    457 CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
    458    if (order == 0) { return 1; }
    459    int32_t max;
    460    if(maxExpansions != nullptr && (max = uhash_igeti(maxExpansions, order)) != 0) {
    461        return max;
    462    }
    463    if ((order & 0xc0) == 0xc0) {
    464        // old-style continuation CE
    465        return 2;
    466    } else {
    467        return 1;
    468    }
    469 }
    470 
    471 U_NAMESPACE_END
    472 
    473 #endif /* #if !UCONFIG_NO_COLLATION */