tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucharstriebuilder.cpp (14220B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2010-2012, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  ucharstriebuilder.h
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2010nov14
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/ucharstrie.h"
     19 #include "unicode/ucharstriebuilder.h"
     20 #include "unicode/unistr.h"
     21 #include "unicode/ustring.h"
     22 #include "cmemory.h"
     23 #include "uarrsort.h"
     24 #include "uassert.h"
     25 #include "uhash.h"
     26 #include "ustr_imp.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 /*
     31 * Note: This builder implementation stores (string, value) pairs with full copies
     32 * of the 16-bit-unit sequences, until the UCharsTrie is built.
     33 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
     34 */
     35 
     36 class UCharsTrieElement : public UMemory {
     37 public:
     38    // Use compiler's default constructor, initializes nothing.
     39 
     40    void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
     41 
     42    UnicodeString getString(const UnicodeString &strings) const {
     43        int32_t length=strings[stringOffset];
     44        return strings.tempSubString(stringOffset+1, length);
     45    }
     46    int32_t getStringLength(const UnicodeString &strings) const {
     47        return strings[stringOffset];
     48    }
     49 
     50    char16_t charAt(int32_t index, const UnicodeString &strings) const {
     51        return strings[stringOffset+1+index];
     52    }
     53 
     54    int32_t getValue() const { return value; }
     55 
     56    int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
     57 
     58 private:
     59    // The first strings unit contains the string length.
     60    // (Compared with a stringLength field here, this saves 2 bytes per string.)
     61    int32_t stringOffset;
     62    int32_t value;
     63 };
     64 
     65 void
     66 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
     67                         UnicodeString &strings, UErrorCode &errorCode) {
     68    if(U_FAILURE(errorCode)) {
     69        return;
     70    }
     71    int32_t length=s.length();
     72    if(length>0xffff) {
     73        // Too long: We store the length in 1 unit.
     74        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
     75        return;
     76    }
     77    stringOffset=strings.length();
     78    strings.append(static_cast<char16_t>(length));
     79    value=val;
     80    strings.append(s);
     81 }
     82 
     83 int32_t
     84 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
     85    return getString(strings).compare(other.getString(strings));
     86 }
     87 
     88 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
     89        : elements(nullptr), elementsCapacity(0), elementsLength(0),
     90          uchars(nullptr), ucharsCapacity(0), ucharsLength(0) {}
     91 
     92 UCharsTrieBuilder::~UCharsTrieBuilder() {
     93    delete[] elements;
     94    uprv_free(uchars);
     95 }
     96 
     97 UCharsTrieBuilder &
     98 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
     99    if(U_FAILURE(errorCode)) {
    100        return *this;
    101    }
    102    if(ucharsLength>0) {
    103        // Cannot add elements after building.
    104        errorCode=U_NO_WRITE_PERMISSION;
    105        return *this;
    106    }
    107    if(elementsLength==elementsCapacity) {
    108        int32_t newCapacity;
    109        if(elementsCapacity==0) {
    110            newCapacity=1024;
    111        } else {
    112            newCapacity=4*elementsCapacity;
    113        }
    114        UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
    115        if(newElements==nullptr) {
    116            errorCode=U_MEMORY_ALLOCATION_ERROR;
    117            return *this;
    118        }
    119        if(elementsLength>0) {
    120            uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(UCharsTrieElement));
    121        }
    122        delete[] elements;
    123        elements=newElements;
    124        elementsCapacity=newCapacity;
    125    }
    126    elements[elementsLength++].setTo(s, value, strings, errorCode);
    127    if(U_SUCCESS(errorCode) && strings.isBogus()) {
    128        errorCode=U_MEMORY_ALLOCATION_ERROR;
    129    }
    130    return *this;
    131 }
    132 
    133 U_CDECL_BEGIN
    134 
    135 static int32_t U_CALLCONV
    136 compareElementStrings(const void *context, const void *left, const void *right) {
    137    const UnicodeString *strings=static_cast<const UnicodeString *>(context);
    138    const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
    139    const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
    140    return leftElement->compareStringTo(*rightElement, *strings);
    141 }
    142 
    143 U_CDECL_END
    144 
    145 UCharsTrie *
    146 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    147    buildUChars(buildOption, errorCode);
    148    UCharsTrie *newTrie=nullptr;
    149    if(U_SUCCESS(errorCode)) {
    150        newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
    151        if(newTrie==nullptr) {
    152            errorCode=U_MEMORY_ALLOCATION_ERROR;
    153        } else {
    154            uchars=nullptr;  // The new trie now owns the array.
    155            ucharsCapacity=0;
    156        }
    157    }
    158    return newTrie;
    159 }
    160 
    161 UnicodeString &
    162 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
    163                                      UErrorCode &errorCode) {
    164    buildUChars(buildOption, errorCode);
    165    if(U_SUCCESS(errorCode)) {
    166        result.setTo(false, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
    167    }
    168    return result;
    169 }
    170 
    171 void
    172 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    173    if(U_FAILURE(errorCode)) {
    174        return;
    175    }
    176    if(uchars!=nullptr && ucharsLength>0) {
    177        // Already built.
    178        return;
    179    }
    180    if(ucharsLength==0) {
    181        if(elementsLength==0) {
    182            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    183            return;
    184        }
    185        if(strings.isBogus()) {
    186            errorCode=U_MEMORY_ALLOCATION_ERROR;
    187            return;
    188        }
    189        uprv_sortArray(elements, elementsLength, static_cast<int32_t>(sizeof(UCharsTrieElement)),
    190                      compareElementStrings, &strings,
    191                      false,  // need not be a stable sort
    192                      &errorCode);
    193        if(U_FAILURE(errorCode)) {
    194            return;
    195        }
    196        // Duplicate strings are not allowed.
    197        UnicodeString prev=elements[0].getString(strings);
    198        for(int32_t i=1; i<elementsLength; ++i) {
    199            UnicodeString current=elements[i].getString(strings);
    200            if(prev==current) {
    201                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    202                return;
    203            }
    204            prev.fastCopyFrom(current);
    205        }
    206    }
    207    // Create and char16_t-serialize the trie for the elements.
    208    ucharsLength=0;
    209    int32_t capacity=strings.length();
    210    if(capacity<1024) {
    211        capacity=1024;
    212    }
    213    if(ucharsCapacity<capacity) {
    214        uprv_free(uchars);
    215        uchars=static_cast<char16_t *>(uprv_malloc(capacity*2));
    216        if(uchars==nullptr) {
    217            errorCode=U_MEMORY_ALLOCATION_ERROR;
    218            ucharsCapacity=0;
    219            return;
    220        }
    221        ucharsCapacity=capacity;
    222    }
    223    StringTrieBuilder::build(buildOption, elementsLength, errorCode);
    224    if(uchars==nullptr) {
    225        errorCode=U_MEMORY_ALLOCATION_ERROR;
    226    }
    227 }
    228 
    229 int32_t
    230 UCharsTrieBuilder::getElementStringLength(int32_t i) const {
    231    return elements[i].getStringLength(strings);
    232 }
    233 
    234 char16_t
    235 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
    236    return elements[i].charAt(unitIndex, strings);
    237 }
    238 
    239 int32_t
    240 UCharsTrieBuilder::getElementValue(int32_t i) const {
    241    return elements[i].getValue();
    242 }
    243 
    244 int32_t
    245 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
    246    const UCharsTrieElement &firstElement=elements[first];
    247    const UCharsTrieElement &lastElement=elements[last];
    248    int32_t minStringLength=firstElement.getStringLength(strings);
    249    while(++unitIndex<minStringLength &&
    250            firstElement.charAt(unitIndex, strings)==
    251            lastElement.charAt(unitIndex, strings)) {}
    252    return unitIndex;
    253 }
    254 
    255 int32_t
    256 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
    257    int32_t length=0;  // Number of different units at unitIndex.
    258    int32_t i=start;
    259    do {
    260        char16_t unit=elements[i++].charAt(unitIndex, strings);
    261        while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
    262            ++i;
    263        }
    264        ++length;
    265    } while(i<limit);
    266    return length;
    267 }
    268 
    269 int32_t
    270 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
    271    do {
    272        char16_t unit=elements[i++].charAt(unitIndex, strings);
    273        while(unit==elements[i].charAt(unitIndex, strings)) {
    274            ++i;
    275        }
    276    } while(--count>0);
    277    return i;
    278 }
    279 
    280 int32_t
    281 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const {
    282    while(unit==elements[i].charAt(unitIndex, strings)) {
    283        ++i;
    284    }
    285    return i;
    286 }
    287 
    288 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const char16_t *units, int32_t len, Node *nextNode)
    289        : LinearMatchNode(len, nextNode), s(units) {
    290    hash=hash*37u+ustr_hashUCharsN(units, len);
    291 }
    292 
    293 bool
    294 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
    295    if(this==&other) {
    296        return true;
    297    }
    298    if(!LinearMatchNode::operator==(other)) {
    299        return false;
    300    }
    301    const UCTLinearMatchNode &o=static_cast<const UCTLinearMatchNode &>(other);
    302    return 0==u_memcmp(s, o.s, length);
    303 }
    304 
    305 void
    306 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
    307    UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
    308    next->write(builder);
    309    b.write(s, length);
    310    offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
    311 }
    312 
    313 StringTrieBuilder::Node *
    314 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
    315                                         Node *nextNode) const {
    316    return new UCTLinearMatchNode(
    317            elements[i].getString(strings).getBuffer()+unitIndex,
    318            length,
    319            nextNode);
    320 }
    321 
    322 UBool
    323 UCharsTrieBuilder::ensureCapacity(int32_t length) {
    324    if(uchars==nullptr) {
    325        return false;  // previous memory allocation had failed
    326    }
    327    if(length>ucharsCapacity) {
    328        int32_t newCapacity=ucharsCapacity;
    329        do {
    330            newCapacity*=2;
    331        } while(newCapacity<=length);
    332        char16_t *newUChars=static_cast<char16_t *>(uprv_malloc(newCapacity*2));
    333        if(newUChars==nullptr) {
    334            // unable to allocate memory
    335            uprv_free(uchars);
    336            uchars=nullptr;
    337            ucharsCapacity=0;
    338            return false;
    339        }
    340        u_memcpy(newUChars+(newCapacity-ucharsLength),
    341                 uchars+(ucharsCapacity-ucharsLength), ucharsLength);
    342        uprv_free(uchars);
    343        uchars=newUChars;
    344        ucharsCapacity=newCapacity;
    345    }
    346    return true;
    347 }
    348 
    349 int32_t
    350 UCharsTrieBuilder::write(int32_t unit) {
    351    int32_t newLength=ucharsLength+1;
    352    if(ensureCapacity(newLength)) {
    353        ucharsLength=newLength;
    354        uchars[ucharsCapacity - ucharsLength] = static_cast<char16_t>(unit);
    355    }
    356    return ucharsLength;
    357 }
    358 
    359 int32_t
    360 UCharsTrieBuilder::write(const char16_t *s, int32_t length) {
    361    int32_t newLength=ucharsLength+length;
    362    if(ensureCapacity(newLength)) {
    363        ucharsLength=newLength;
    364        u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
    365    }
    366    return ucharsLength;
    367 }
    368 
    369 int32_t
    370 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
    371    return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
    372 }
    373 
    374 int32_t
    375 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
    376    if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
    377        return write(i|(isFinal<<15));
    378    }
    379    char16_t intUnits[3];
    380    int32_t length;
    381    if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
    382        intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitValueLead);
    383        intUnits[1] = static_cast<char16_t>(static_cast<uint32_t>(i) >> 16);
    384        intUnits[2] = static_cast<char16_t>(i);
    385        length=3;
    386    // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
    387    //     intUnits[0]=(char16_t)(i);
    388    //     length=1;
    389    } else {
    390        intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitValueLead + (i >> 16));
    391        intUnits[1] = static_cast<char16_t>(i);
    392        length=2;
    393    }
    394    intUnits[0] = static_cast<char16_t>(intUnits[0] | (isFinal << 15));
    395    return write(intUnits, length);
    396 }
    397 
    398 int32_t
    399 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    400    if(!hasValue) {
    401        return write(node);
    402    }
    403    char16_t intUnits[3];
    404    int32_t length;
    405    if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
    406        intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitNodeValueLead);
    407        intUnits[1] = static_cast<char16_t>(static_cast<uint32_t>(value) >> 16);
    408        intUnits[2] = static_cast<char16_t>(value);
    409        length=3;
    410    } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
    411        intUnits[0] = static_cast<char16_t>((value + 1) << 6);
    412        length=1;
    413    } else {
    414        intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitNodeValueLead + ((value >> 10) & 0x7fc0));
    415        intUnits[1] = static_cast<char16_t>(value);
    416        length=2;
    417    }
    418    intUnits[0] |= static_cast<char16_t>(node);
    419    return write(intUnits, length);
    420 }
    421 
    422 int32_t
    423 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
    424    int32_t i=ucharsLength-jumpTarget;
    425    U_ASSERT(i>=0);
    426    if(i<=UCharsTrie::kMaxOneUnitDelta) {
    427        return write(i);
    428    }
    429    char16_t intUnits[3];
    430    int32_t length;
    431    if(i<=UCharsTrie::kMaxTwoUnitDelta) {
    432        intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitDeltaLead + (i >> 16));
    433        length=1;
    434    } else {
    435        intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitDeltaLead);
    436        intUnits[1] = static_cast<char16_t>(i >> 16);
    437        length=2;
    438    }
    439    intUnits[length++] = static_cast<char16_t>(i);
    440    return write(intUnits, length);
    441 }
    442 
    443 U_NAMESPACE_END