tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

bytestriebuilder.cpp (16435B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2010-2012, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  bytestriebuilder.cpp
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2010sep25
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/bytestrie.h"
     19 #include "unicode/bytestriebuilder.h"
     20 #include "unicode/stringpiece.h"
     21 #include "charstr.h"
     22 #include "cmemory.h"
     23 #include "uhash.h"
     24 #include "uarrsort.h"
     25 #include "uassert.h"
     26 #include "ustr_imp.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 /*
     31 * Note: This builder implementation stores (bytes, value) pairs with full copies
     32 * of the byte sequences, until the BytesTrie is built.
     33 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
     34 */
     35 
     36 class BytesTrieElement : public UMemory {
     37 public:
     38    // Use compiler's default constructor, initializes nothing.
     39 
     40    void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
     41 
     42    StringPiece getString(const CharString &strings) const {
     43        int32_t offset=stringOffset;
     44        int32_t length;
     45        if(offset>=0) {
     46            length = static_cast<uint8_t>(strings[offset++]);
     47        } else {
     48            offset=~offset;
     49            length = (static_cast<int32_t>(static_cast<uint8_t>(strings[offset])) << 8) | static_cast<uint8_t>(strings[offset + 1]);
     50            offset+=2;
     51        }
     52        return StringPiece(strings.data()+offset, length);
     53    }
     54    int32_t getStringLength(const CharString &strings) const {
     55        int32_t offset=stringOffset;
     56        if(offset>=0) {
     57            return static_cast<uint8_t>(strings[offset]);
     58        } else {
     59            offset=~offset;
     60            return (static_cast<int32_t>(static_cast<uint8_t>(strings[offset])) << 8) | static_cast<uint8_t>(strings[offset + 1]);
     61        }
     62    }
     63 
     64    char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
     65 
     66    int32_t getValue() const { return value; }
     67 
     68    int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
     69 
     70 private:
     71    const char *data(const CharString &strings) const {
     72        int32_t offset=stringOffset;
     73        if(offset>=0) {
     74            ++offset;
     75        } else {
     76            offset=~offset+2;
     77        }
     78        return strings.data()+offset;
     79    }
     80 
     81    // If the stringOffset is non-negative, then the first strings byte contains
     82    // the string length.
     83    // If the stringOffset is negative, then the first two strings bytes contain
     84    // the string length (big-endian), and the offset needs to be bit-inverted.
     85    // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
     86    int32_t stringOffset;
     87    int32_t value;
     88 };
     89 
     90 void
     91 BytesTrieElement::setTo(StringPiece s, int32_t val,
     92                        CharString &strings, UErrorCode &errorCode) {
     93    if(U_FAILURE(errorCode)) {
     94        return;
     95    }
     96    int32_t length=s.length();
     97    if(length>0xffff) {
     98        // Too long: We store the length in 1 or 2 bytes.
     99        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    100        return;
    101    }
    102    int32_t offset=strings.length();
    103    if(length>0xff) {
    104        offset=~offset;
    105        strings.append(static_cast<char>(length >> 8), errorCode);
    106    }
    107    strings.append(static_cast<char>(length), errorCode);
    108    stringOffset=offset;
    109    value=val;
    110    strings.append(s, errorCode);
    111 }
    112 
    113 int32_t
    114 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
    115    // TODO: add StringPiece::compare(), see ticket #8187
    116    StringPiece thisString=getString(strings);
    117    StringPiece otherString=other.getString(strings);
    118    int32_t lengthDiff=thisString.length()-otherString.length();
    119    int32_t commonLength;
    120    if(lengthDiff<=0) {
    121        commonLength=thisString.length();
    122    } else {
    123        commonLength=otherString.length();
    124    }
    125    int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
    126    return diff!=0 ? diff : lengthDiff;
    127 }
    128 
    129 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
    130        : strings(nullptr), elements(nullptr), elementsCapacity(0), elementsLength(0),
    131          bytes(nullptr), bytesCapacity(0), bytesLength(0) {
    132    if(U_FAILURE(errorCode)) {
    133        return;
    134    }
    135    strings=new CharString();
    136    if(strings==nullptr) {
    137        errorCode=U_MEMORY_ALLOCATION_ERROR;
    138    }
    139 }
    140 
    141 BytesTrieBuilder::~BytesTrieBuilder() {
    142    delete strings;
    143    delete[] elements;
    144    uprv_free(bytes);
    145 }
    146 
    147 BytesTrieBuilder &
    148 BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
    149    if(U_FAILURE(errorCode)) {
    150        return *this;
    151    }
    152    if(bytesLength>0) {
    153        // Cannot add elements after building.
    154        errorCode=U_NO_WRITE_PERMISSION;
    155        return *this;
    156    }
    157    if(elementsLength==elementsCapacity) {
    158        int32_t newCapacity;
    159        if(elementsCapacity==0) {
    160            newCapacity=1024;
    161        } else {
    162            newCapacity=4*elementsCapacity;
    163        }
    164        BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
    165        if(newElements==nullptr) {
    166            errorCode=U_MEMORY_ALLOCATION_ERROR;
    167            return *this; // error instead of dereferencing null
    168        }
    169        if(elementsLength>0) {
    170            uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
    171        }
    172        delete[] elements;
    173        elements=newElements;
    174        elementsCapacity=newCapacity;
    175    }
    176    elements[elementsLength++].setTo(s, value, *strings, errorCode);
    177    return *this;
    178 }
    179 
    180 U_CDECL_BEGIN
    181 
    182 static int32_t U_CALLCONV
    183 compareElementStrings(const void *context, const void *left, const void *right) {
    184    const CharString *strings=static_cast<const CharString *>(context);
    185    const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
    186    const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
    187    return leftElement->compareStringTo(*rightElement, *strings);
    188 }
    189 
    190 U_CDECL_END
    191 
    192 BytesTrie *
    193 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    194    buildBytes(buildOption, errorCode);
    195    BytesTrie *newTrie=nullptr;
    196    if(U_SUCCESS(errorCode)) {
    197        newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
    198        if(newTrie==nullptr) {
    199            errorCode=U_MEMORY_ALLOCATION_ERROR;
    200        } else {
    201            bytes=nullptr;  // The new trie now owns the array.
    202            bytesCapacity=0;
    203        }
    204    }
    205    return newTrie;
    206 }
    207 
    208 StringPiece
    209 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    210    buildBytes(buildOption, errorCode);
    211    StringPiece result;
    212    if(U_SUCCESS(errorCode)) {
    213        result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
    214    }
    215    return result;
    216 }
    217 
    218 void
    219 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    220    if(U_FAILURE(errorCode)) {
    221        return;
    222    }
    223    if(bytes!=nullptr && bytesLength>0) {
    224        // Already built.
    225        return;
    226    }
    227    if(bytesLength==0) {
    228        if(elementsLength==0) {
    229            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    230            return;
    231        }
    232        uprv_sortArray(elements, elementsLength, static_cast<int32_t>(sizeof(BytesTrieElement)),
    233                      compareElementStrings, strings,
    234                      false,  // need not be a stable sort
    235                      &errorCode);
    236        if(U_FAILURE(errorCode)) {
    237            return;
    238        }
    239        // Duplicate strings are not allowed.
    240        StringPiece prev=elements[0].getString(*strings);
    241        for(int32_t i=1; i<elementsLength; ++i) {
    242            StringPiece current=elements[i].getString(*strings);
    243            if(prev==current) {
    244                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    245                return;
    246            }
    247            prev=current;
    248        }
    249    }
    250    // Create and byte-serialize the trie for the elements.
    251    bytesLength=0;
    252    int32_t capacity=strings->length();
    253    if(capacity<1024) {
    254        capacity=1024;
    255    }
    256    if(bytesCapacity<capacity) {
    257        uprv_free(bytes);
    258        bytes=static_cast<char *>(uprv_malloc(capacity));
    259        if(bytes==nullptr) {
    260            errorCode=U_MEMORY_ALLOCATION_ERROR;
    261            bytesCapacity=0;
    262            return;
    263        }
    264        bytesCapacity=capacity;
    265    }
    266    StringTrieBuilder::build(buildOption, elementsLength, errorCode);
    267    if(bytes==nullptr) {
    268        errorCode=U_MEMORY_ALLOCATION_ERROR;
    269    }
    270 }
    271 
    272 BytesTrieBuilder &
    273 BytesTrieBuilder::clear() {
    274    strings->clear();
    275    elementsLength=0;
    276    bytesLength=0;
    277    return *this;
    278 }
    279 
    280 int32_t
    281 BytesTrieBuilder::getElementStringLength(int32_t i) const {
    282    return elements[i].getStringLength(*strings);
    283 }
    284 
    285 char16_t
    286 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
    287    return static_cast<uint8_t>(elements[i].charAt(byteIndex, *strings));
    288 }
    289 
    290 int32_t
    291 BytesTrieBuilder::getElementValue(int32_t i) const {
    292    return elements[i].getValue();
    293 }
    294 
    295 int32_t
    296 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
    297    const BytesTrieElement &firstElement=elements[first];
    298    const BytesTrieElement &lastElement=elements[last];
    299    int32_t minStringLength=firstElement.getStringLength(*strings);
    300    while(++byteIndex<minStringLength &&
    301            firstElement.charAt(byteIndex, *strings)==
    302            lastElement.charAt(byteIndex, *strings)) {}
    303    return byteIndex;
    304 }
    305 
    306 int32_t
    307 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
    308    int32_t length=0;  // Number of different bytes at byteIndex.
    309    int32_t i=start;
    310    do {
    311        char byte=elements[i++].charAt(byteIndex, *strings);
    312        while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
    313            ++i;
    314        }
    315        ++length;
    316    } while(i<limit);
    317    return length;
    318 }
    319 
    320 int32_t
    321 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
    322    do {
    323        char byte=elements[i++].charAt(byteIndex, *strings);
    324        while(byte==elements[i].charAt(byteIndex, *strings)) {
    325            ++i;
    326        }
    327    } while(--count>0);
    328    return i;
    329 }
    330 
    331 int32_t
    332 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, char16_t byte) const {
    333    char b = static_cast<char>(byte);
    334    while(b==elements[i].charAt(byteIndex, *strings)) {
    335        ++i;
    336    }
    337    return i;
    338 }
    339 
    340 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
    341        : LinearMatchNode(len, nextNode), s(bytes) {
    342    hash=static_cast<int32_t>(
    343        static_cast<uint32_t>(hash)*37u + static_cast<uint32_t>(ustr_hashCharsN(bytes, len)));
    344 }
    345 
    346 bool
    347 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
    348    if(this==&other) {
    349        return true;
    350    }
    351    if(!LinearMatchNode::operator==(other)) {
    352        return false;
    353    }
    354    const BTLinearMatchNode &o=static_cast<const BTLinearMatchNode &>(other);
    355    return 0==uprv_memcmp(s, o.s, length);
    356 }
    357 
    358 void
    359 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
    360    BytesTrieBuilder &b=static_cast<BytesTrieBuilder &>(builder);
    361    next->write(builder);
    362    b.write(s, length);
    363    offset=b.write(b.getMinLinearMatch()+length-1);
    364 }
    365 
    366 StringTrieBuilder::Node *
    367 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
    368                                        Node *nextNode) const {
    369    return new BTLinearMatchNode(
    370            elements[i].getString(*strings).data()+byteIndex,
    371            length,
    372            nextNode);
    373 }
    374 
    375 UBool
    376 BytesTrieBuilder::ensureCapacity(int32_t length) {
    377    if(bytes==nullptr) {
    378        return false;  // previous memory allocation had failed
    379    }
    380    if(length>bytesCapacity) {
    381        int32_t newCapacity=bytesCapacity;
    382        do {
    383            newCapacity*=2;
    384        } while(newCapacity<=length);
    385        char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
    386        if(newBytes==nullptr) {
    387            // unable to allocate memory
    388            uprv_free(bytes);
    389            bytes=nullptr;
    390            bytesCapacity=0;
    391            return false;
    392        }
    393        uprv_memcpy(newBytes+(newCapacity-bytesLength),
    394                    bytes+(bytesCapacity-bytesLength), bytesLength);
    395        uprv_free(bytes);
    396        bytes=newBytes;
    397        bytesCapacity=newCapacity;
    398    }
    399    return true;
    400 }
    401 
    402 int32_t
    403 BytesTrieBuilder::write(int32_t byte) {
    404    int32_t newLength=bytesLength+1;
    405    if(ensureCapacity(newLength)) {
    406        bytesLength=newLength;
    407        bytes[bytesCapacity - bytesLength] = static_cast<char>(byte);
    408    }
    409    return bytesLength;
    410 }
    411 
    412 int32_t
    413 BytesTrieBuilder::write(const char *b, int32_t length) {
    414    int32_t newLength=bytesLength+length;
    415    if(ensureCapacity(newLength)) {
    416        bytesLength=newLength;
    417        uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
    418    }
    419    return bytesLength;
    420 }
    421 
    422 int32_t
    423 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
    424    return write(elements[i].getString(*strings).data()+byteIndex, length);
    425 }
    426 
    427 int32_t
    428 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
    429    if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
    430        return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
    431    }
    432    char intBytes[5];
    433    int32_t length=1;
    434    if(i<0 || i>0xffffff) {
    435        intBytes[0] = static_cast<char>(BytesTrie::kFiveByteValueLead);
    436        intBytes[1] = static_cast<char>(static_cast<uint32_t>(i) >> 24);
    437        intBytes[2] = static_cast<char>(static_cast<uint32_t>(i) >> 16);
    438        intBytes[3] = static_cast<char>(static_cast<uint32_t>(i) >> 8);
    439        intBytes[4] = static_cast<char>(i);
    440        length=5;
    441    // } else if(i<=BytesTrie::kMaxOneByteValue) {
    442    //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
    443    } else {
    444        if(i<=BytesTrie::kMaxTwoByteValue) {
    445            intBytes[0] = static_cast<char>(BytesTrie::kMinTwoByteValueLead + (i >> 8));
    446        } else {
    447            if(i<=BytesTrie::kMaxThreeByteValue) {
    448                intBytes[0] = static_cast<char>(BytesTrie::kMinThreeByteValueLead + (i >> 16));
    449            } else {
    450                intBytes[0] = static_cast<char>(BytesTrie::kFourByteValueLead);
    451                intBytes[1] = static_cast<char>(i >> 16);
    452                length=2;
    453            }
    454            intBytes[length++] = static_cast<char>(i >> 8);
    455        }
    456        intBytes[length++] = static_cast<char>(i);
    457    }
    458    intBytes[0] = static_cast<char>((intBytes[0] << 1) | isFinal);
    459    return write(intBytes, length);
    460 }
    461 
    462 int32_t
    463 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    464    int32_t offset=write(node);
    465    if(hasValue) {
    466        offset=writeValueAndFinal(value, false);
    467    }
    468    return offset;
    469 }
    470 
    471 int32_t
    472 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
    473    int32_t i=bytesLength-jumpTarget;
    474    U_ASSERT(i>=0);
    475    if(i<=BytesTrie::kMaxOneByteDelta) {
    476        return write(i);
    477    } else {
    478        char intBytes[5];
    479        return write(intBytes, internalEncodeDelta(i, intBytes));
    480    }
    481 }
    482 
    483 int32_t
    484 BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) {
    485    U_ASSERT(i>=0);
    486    if(i<=BytesTrie::kMaxOneByteDelta) {
    487        intBytes[0] = static_cast<char>(i);
    488        return 1;
    489    }
    490    int32_t length=1;
    491    if(i<=BytesTrie::kMaxTwoByteDelta) {
    492        intBytes[0] = static_cast<char>(BytesTrie::kMinTwoByteDeltaLead + (i >> 8));
    493    } else {
    494        if(i<=BytesTrie::kMaxThreeByteDelta) {
    495            intBytes[0] = static_cast<char>(BytesTrie::kMinThreeByteDeltaLead + (i >> 16));
    496        } else {
    497            if(i<=0xffffff) {
    498                intBytes[0] = static_cast<char>(BytesTrie::kFourByteDeltaLead);
    499            } else {
    500                intBytes[0] = static_cast<char>(BytesTrie::kFiveByteDeltaLead);
    501                intBytes[1] = static_cast<char>(i >> 24);
    502                length=2;
    503            }
    504            intBytes[length++] = static_cast<char>(i >> 16);
    505        }
    506        intBytes[length++] = static_cast<char>(i >> 8);
    507    }
    508    intBytes[length++] = static_cast<char>(i);
    509    return length;
    510 }
    511 
    512 U_NAMESPACE_END