tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

stringtriebuilder.h (15902B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2010-2012,2014, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  stringtriebuilder.h
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2010dec24
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #ifndef __STRINGTRIEBUILDER_H__
     18 #define __STRINGTRIEBUILDER_H__
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if U_SHOW_CPLUSPLUS_API
     23 
     24 #include "unicode/uobject.h"
     25 
     26 /**
     27 * \file
     28 * \brief C++ API: Builder API for trie builders
     29 */
     30 
     31 // Forward declaration.
     32 /// \cond
     33 struct UHashtable;
     34 typedef struct UHashtable UHashtable;
     35 /// \endcond
     36 
     37 /**
     38 * Build options for BytesTrieBuilder and CharsTrieBuilder.
     39 * @stable ICU 4.8
     40 */
     41 enum UStringTrieBuildOption {
     42    /**
     43     * Builds a trie quickly.
     44     * @stable ICU 4.8
     45     */
     46    USTRINGTRIE_BUILD_FAST,
     47    /**
     48     * Builds a trie more slowly, attempting to generate
     49     * a shorter but equivalent serialization.
     50     * This build option also uses more memory.
     51     *
     52     * This option can be effective when many integer values are the same
     53     * and string/byte sequence suffixes can be shared.
     54     * Runtime speed is not expected to improve.
     55     * @stable ICU 4.8
     56     */
     57    USTRINGTRIE_BUILD_SMALL
     58 };
     59 
     60 U_NAMESPACE_BEGIN
     61 
     62 /**
     63 * Base class for string trie builder classes.
     64 *
     65 * This class is not intended for public subclassing.
     66 * @stable ICU 4.8
     67 */
     68 class U_COMMON_API StringTrieBuilder : public UObject {
     69 public:
     70 #ifndef U_HIDE_INTERNAL_API
     71    /** @internal */
     72    static int32_t hashNode(const void *node);
     73    /** @internal */
     74    static UBool equalNodes(const void *left, const void *right);
     75 #endif  /* U_HIDE_INTERNAL_API */
     76 
     77 protected:
     78    // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
     79    // or else the compiler will create a public default constructor.
     80    /** @internal */
     81    StringTrieBuilder();
     82    /** @internal */
     83    virtual ~StringTrieBuilder();
     84 
     85 #ifndef U_HIDE_INTERNAL_API
     86    /** @internal */
     87    void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
     88    /** @internal */
     89    void deleteCompactBuilder();
     90 
     91    /** @internal */
     92    void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
     93 
     94    /** @internal */
     95    int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
     96    /** @internal */
     97    int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
     98 #endif  /* U_HIDE_INTERNAL_API */
     99 
    100    class Node;
    101 
    102 #ifndef U_HIDE_INTERNAL_API
    103    /** @internal */
    104    Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
    105    /** @internal */
    106    Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
    107                            int32_t length, UErrorCode &errorCode);
    108 #endif  /* U_HIDE_INTERNAL_API */
    109 
    110    /** @internal */
    111    virtual int32_t getElementStringLength(int32_t i) const = 0;
    112    /** @internal */
    113    virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
    114    /** @internal */
    115    virtual int32_t getElementValue(int32_t i) const = 0;
    116 
    117    // Finds the first unit index after this one where
    118    // the first and last element have different units again.
    119    /** @internal */
    120    virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
    121 
    122    // Number of different units at unitIndex.
    123    /** @internal */
    124    virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
    125    /** @internal */
    126    virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
    127    /** @internal */
    128    virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
    129 
    130    /** @internal */
    131    virtual UBool matchNodesCanHaveValues() const = 0;
    132 
    133    /** @internal */
    134    virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
    135    /** @internal */
    136    virtual int32_t getMinLinearMatch() const = 0;
    137    /** @internal */
    138    virtual int32_t getMaxLinearMatchLength() const = 0;
    139 
    140 #ifndef U_HIDE_INTERNAL_API
    141    // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
    142    /** @internal */
    143    static const int32_t kMaxBranchLinearSubNodeLength=5;
    144 
    145    // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
    146    // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
    147    /** @internal */
    148    static const int32_t kMaxSplitBranchLevels=14;
    149 
    150    /**
    151     * Makes sure that there is only one unique node registered that is
    152     * equivalent to newNode.
    153     * @param newNode Input node. The builder takes ownership.
    154     * @param errorCode ICU in/out UErrorCode.
    155                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==nullptr.
    156     * @return newNode if it is the first of its kind, or
    157     *         an equivalent node if newNode is a duplicate.
    158     * @internal
    159     */
    160    Node *registerNode(Node *newNode, UErrorCode &errorCode);
    161    /**
    162     * Makes sure that there is only one unique FinalValueNode registered
    163     * with this value.
    164     * Avoids creating a node if the value is a duplicate.
    165     * @param value A final value.
    166     * @param errorCode ICU in/out UErrorCode.
    167                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==nullptr.
    168     * @return A FinalValueNode with the given value.
    169     * @internal
    170     */
    171    Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
    172 #endif  /* U_HIDE_INTERNAL_API */
    173 
    174    /*
    175     * C++ note:
    176     * registerNode() and registerFinalValue() take ownership of their input nodes,
    177     * and only return owned nodes.
    178     * If they see a failure UErrorCode, they will delete the input node.
    179     * If they get a nullptr pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
    180     * If there is a failure, they return nullptr.
    181     *
    182     * nullptr Node pointers can be safely passed into other Nodes because
    183     * they call the static Node::hashCode() which checks for a nullptr pointer first.
    184     *
    185     * Therefore, as long as builder functions register a new node,
    186     * they need to check for failures only before explicitly dereferencing
    187     * a Node pointer, or before setting a new UErrorCode.
    188     */
    189 
    190    // Hash set of nodes, maps from nodes to integer 1.
    191    /** @internal */
    192    UHashtable *nodes;
    193 
    194    // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
    195    // it is needed for layout of other objects.
    196    /**
    197     * @internal
    198     * \cond
    199     */
    200    class Node : public UObject {
    201    public:
    202        Node(int32_t initialHash) : hash(initialHash), offset(0) {}
    203        inline int32_t hashCode() const { return hash; }
    204        // Handles node==nullptr.
    205        static inline int32_t hashCode(const Node *node) { return node==nullptr ? 0 : node->hashCode(); }
    206        // Base class operator==() compares the actual class types.
    207        virtual bool operator==(const Node &other) const;
    208        inline bool operator!=(const Node &other) const { return !operator==(other); }
    209        /**
    210         * Traverses the Node graph and numbers branch edges, with rightmost edges first.
    211         * This is to avoid writing a duplicate node twice.
    212         *
    213         * Branch nodes in this trie data structure are not symmetric.
    214         * Most branch edges "jump" to other nodes but the rightmost branch edges
    215         * just continue without a jump.
    216         * Therefore, write() must write the rightmost branch edge last
    217         * (trie units are written backwards), and must write it at that point even if
    218         * it is a duplicate of a node previously written elsewhere.
    219         *
    220         * This function visits and marks right branch edges first.
    221         * Edges are numbered with increasingly negative values because we share the
    222         * offset field which gets positive values when nodes are written.
    223         * A branch edge also remembers the first number for any of its edges.
    224         *
    225         * When a further-left branch edge has a number in the range of the rightmost
    226         * edge's numbers, then it will be written as part of the required right edge
    227         * and we can avoid writing it first.
    228         *
    229         * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
    230         * edge numbers.
    231         *
    232         * @param edgeNumber The first edge number for this node and its sub-nodes.
    233         * @return An edge number that is at least the maximum-negative
    234         *         of the input edge number and the numbers of this node and all of its sub-nodes.
    235         */
    236        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    237        // write() must set the offset to a positive value.
    238        virtual void write(StringTrieBuilder &builder) = 0;
    239        // See markRightEdgesFirst.
    240        inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
    241                                               StringTrieBuilder &builder) {
    242            // Note: Edge numbers are negative, lastRight<=firstRight.
    243            // If offset>0 then this node and its sub-nodes have been written already
    244            // and we need not write them again.
    245            // If this node is part of the unwritten right branch edge,
    246            // then we wait until that is written.
    247            if(offset<0 && (offset<lastRight || firstRight<offset)) {
    248                write(builder);
    249            }
    250        }
    251        inline int32_t getOffset() const { return offset; }
    252    protected:
    253        int32_t hash;
    254        int32_t offset;
    255    };
    256 
    257 #ifndef U_HIDE_INTERNAL_API
    258    // This class should not be overridden because
    259    // registerFinalValue() compares a stack-allocated FinalValueNode
    260    // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
    261    // with the input node, and the
    262    // !Node::operator==(other) used inside FinalValueNode::operator==(other)
    263    // will be false if the typeid's are different.
    264    /** @internal */
    265    class FinalValueNode : public Node {
    266    public:
    267        FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
    268        virtual bool operator==(const Node &other) const override;
    269        virtual void write(StringTrieBuilder &builder) override;
    270    protected:
    271        int32_t value;
    272    };
    273 #endif  /* U_HIDE_INTERNAL_API */
    274 
    275    // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
    276    // it is needed for layout of other objects.
    277    /**
    278     * @internal 
    279     */
    280    class ValueNode : public Node {
    281    public:
    282        ValueNode(int32_t initialHash) : Node(initialHash), hasValue(false), value(0) {}
    283        virtual bool operator==(const Node &other) const override;
    284        void setValue(int32_t v) {
    285            hasValue=true;
    286            value=v;
    287            hash=hash*37u+v;
    288        }
    289    protected:
    290        UBool hasValue;
    291        int32_t value;
    292    };
    293 
    294 #ifndef U_HIDE_INTERNAL_API
    295    /** 
    296     * @internal 
    297     */
    298    class IntermediateValueNode : public ValueNode {
    299    public:
    300        IntermediateValueNode(int32_t v, Node *nextNode)
    301                : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
    302        virtual bool operator==(const Node &other) const override;
    303        virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
    304        virtual void write(StringTrieBuilder &builder) override;
    305    protected:
    306        Node *next;
    307    };
    308 #endif  /* U_HIDE_INTERNAL_API */
    309 
    310    // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
    311    // it is needed for layout of other objects.
    312    /**
    313     * @internal 
    314     */
    315    class LinearMatchNode : public ValueNode {
    316    public:
    317        LinearMatchNode(int32_t len, Node *nextNode)
    318                : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
    319                  length(len), next(nextNode) {}
    320        virtual bool operator==(const Node &other) const override;
    321        virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
    322    protected:
    323        int32_t length;
    324        Node *next;
    325    };
    326 
    327 #ifndef U_HIDE_INTERNAL_API
    328    /**
    329     * @internal 
    330     */
    331    class BranchNode : public Node {
    332    public:
    333        BranchNode(int32_t initialHash) : Node(initialHash) {}
    334    protected:
    335        int32_t firstEdgeNumber;
    336    };
    337 
    338    /**
    339     * @internal 
    340     */
    341    class ListBranchNode : public BranchNode {
    342    public:
    343        ListBranchNode() : BranchNode(0x444444), length(0) {}
    344        virtual bool operator==(const Node &other) const override;
    345        virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
    346        virtual void write(StringTrieBuilder &builder) override;
    347        // Adds a unit with a final value.
    348        void add(int32_t c, int32_t value) {
    349            units[length] = static_cast<char16_t>(c);
    350            equal[length]=nullptr;
    351            values[length]=value;
    352            ++length;
    353            hash=(hash*37u+c)*37u+value;
    354        }
    355        // Adds a unit which leads to another match node.
    356        void add(int32_t c, Node *node) {
    357            units[length] = static_cast<char16_t>(c);
    358            equal[length]=node;
    359            values[length]=0;
    360            ++length;
    361            hash=(hash*37u+c)*37u+hashCode(node);
    362        }
    363    protected:
    364        Node *equal[kMaxBranchLinearSubNodeLength];  // nullptr means "has final value".
    365        int32_t length;
    366        int32_t values[kMaxBranchLinearSubNodeLength];
    367        char16_t units[kMaxBranchLinearSubNodeLength];
    368    };
    369 
    370    /**
    371     * @internal 
    372     */
    373    class SplitBranchNode : public BranchNode {
    374    public:
    375        SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
    376                : BranchNode(((0x555555u*37u+middleUnit)*37u+
    377                              hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
    378                  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
    379        virtual bool operator==(const Node &other) const override;
    380        virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
    381        virtual void write(StringTrieBuilder &builder) override;
    382    protected:
    383        char16_t unit;
    384        Node *lessThan;
    385        Node *greaterOrEqual;
    386    };
    387 
    388    // Branch head node, for writing the actual node lead unit.
    389    /** @internal */
    390    class BranchHeadNode : public ValueNode {
    391    public:
    392        BranchHeadNode(int32_t len, Node *subNode)
    393                : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
    394                  length(len), next(subNode) {}
    395        virtual bool operator==(const Node &other) const override;
    396        virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
    397        virtual void write(StringTrieBuilder &builder) override;
    398    protected:
    399        int32_t length;
    400        Node *next;  // A branch sub-node.
    401    };
    402 
    403 #endif  /* U_HIDE_INTERNAL_API */
    404    /// \endcond
    405 
    406    /** @internal */
    407    virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
    408                                        Node *nextNode) const = 0;
    409 
    410    /** @internal */
    411    virtual int32_t write(int32_t unit) = 0;
    412    /** @internal */
    413    virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
    414    /** @internal */
    415    virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
    416    /** @internal */
    417    virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
    418    /** @internal */
    419    virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
    420 };
    421 
    422 U_NAMESPACE_END
    423 
    424 #endif /* U_SHOW_CPLUSPLUS_API */
    425 
    426 #endif  // __STRINGTRIEBUILDER_H__