tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbbisetb.h (6082B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  rbbisetb.h
      5 /*
      6 **********************************************************************
      7 *   Copyright (c) 2001-2005, International Business Machines
      8 *   Corporation and others.  All Rights Reserved.
      9 **********************************************************************
     10 */
     11 
     12 #ifndef RBBISETB_H
     13 #define RBBISETB_H
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_BREAK_ITERATION
     18 
     19 #include "unicode/ucptrie.h"
     20 #include "unicode/umutablecptrie.h"
     21 #include "unicode/uobject.h"
     22 #include "rbbirb.h"
     23 #include "uvector.h"
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 //
     28 //  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
     29 //                   from the Unicode Sets appearing in the source  RBBI rules, and
     30 //                   creates the TRIE table used to map from Unicode to the
     31 //                   character categories.
     32 //
     33 
     34 
     35 //
     36 //  RangeDescriptor
     37 //
     38 //     Each of the non-overlapping character ranges gets one of these descriptors.
     39 //     All of them are strung together in a linked list, which is kept in order
     40 //     (by character)
     41 //
     42 class RangeDescriptor : public UMemory {        // One node of the ordered linked list of non-overlapping character ranges.
     43 public:
     44    UChar32            fStartChar {};            // Start of range, Unicode 32-bit value.
     45    UChar32            fEndChar {};              // End of range, Unicode 32-bit value.
     46    int32_t            fNum {0};                 // Runtime-mapped input value for this range.
     47    bool               fIncludesDict {false};    // True if the range includes $dictionary.
     48    bool               fFirstInGroup {false};    // True if first range in a group with the same fNum.
     49    UVector           *fIncludesSets {nullptr};  // Vector of the original
     50                                                 //   Unicode sets that include this range.
     51                                                 //    (Contains pointers to uset nodes.)
     52    RangeDescriptor   *fNext {nullptr};          // Next RangeDescriptor in the linked list; nullptr by default.
     53 
     54    RangeDescriptor(UErrorCode &status);   // status: ICU in/out error code. NOTE(review): presumably set on allocation failure -- confirm in the .cpp.
     55    RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);   // Status-reporting copy. NOTE(review): presumably duplicates fIncludesSets -- confirm.
     56    ~RangeDescriptor();   // NOTE(review): presumably releases fIncludesSets; ownership is not visible in this header.
     57    void split(UChar32 where, UErrorCode &status);   // Split this range in two at "where", with
     58                                        //   "where" appearing in the second (higher) part.
     59    bool isDictionaryRange();           // Check whether this range appears as part of
     60                                        //   the Unicode set named "dictionary".
     61 
     62    RangeDescriptor(const RangeDescriptor &other) = delete; // Forbid default copying; use the status-taking overload above.
     63    RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // Forbid assignment of this class.
     64 };
     65 
     66 
     67 //
     68 //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
     69 //
     70 //      Starting with the rules parse tree from the scanner,
     71 //
     72 //                   -  Enumerate the set of UnicodeSets that are referenced
     73 //                      by the RBBI rules.
     74 //                   -  compute a derived set of non-overlapping UnicodeSets
     75 //                      that will correspond to columns in the state table for
     76 //                      the RBBI execution engine.
     77 //                   -  construct the trie table that maps input characters
     78 //                      to set numbers in the non-overlapping set of sets.
     79 //
     80 
     81 
     82 class RBBISetBuilder : public UMemory {   // Turns the UnicodeSets referenced by RBBI rules into character categories and a trie.
     83 public:
     84    RBBISetBuilder(RBBIRuleBuilder *rb);   // rb: the rule builder; NOTE(review): presumably stored in fRB -- confirm in the .cpp.
     85    ~RBBISetBuilder();
     86 
     87    void     buildRanges();   // NOTE(review): presumably builds the non-overlapping range list (fRangeList) -- confirm.
     88    void     buildTrie();     // NOTE(review): presumably builds the trie mapping characters to set numbers -- confirm.
     89    void     addValToSets(UVector *sets,      uint32_t val);   // NOTE(review): presumably applies addValToSet to each node in sets -- confirm.
     90    void     addValToSet (RBBINode *usetNode, uint32_t val);
     91    int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
     92                                             //    runtime state machine, which are the same as
     93                                             //    columns in the DFA state table.
     94    int32_t  getDictCategoriesStart() const; // First char category that includes $dictionary, or
     95                                             // last category + 1 if there are no dictionary categories.
     96    int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie. Declared non-const.
     97    void     serializeTrie(uint8_t *where);  // Write out the serialized Trie to "where".
     98    UChar32  getFirstChar(int32_t  val) const;   // NOTE(review): presumably the first character mapped to category val -- confirm.
     99    UBool    sawBOF() const;                 // Indicate whether any references to the {bof} pseudo
    100                                             //   character were encountered.
    101    /**
    102     * Merge two character categories that have been identified as having equivalent behavior.
    103     * The ranges belonging to the second category (table column) will be added to the first.
    104     * @param categories the pair of categories to be merged.
    105     */
    106    void     mergeCategories(IntPair categories);
    107 
    108 #ifdef RBBI_DEBUG
    109    void     printSets();
    110    void     printRanges();
    111    void     printRangeGroups();
    112 #else   // Non-debug builds: calls expand to nothing. These macros are NOT scoped to the class; they affect all later code.
    113    #define printSets()
    114    #define printRanges()
    115    #define printRangeGroups()
    116 #endif
    117 
    118 private:
    119    RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us.
    120    UErrorCode            *fStatus;         // NOTE(review): presumably points at the rule builder's error code -- confirm.
    121 
    122    RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors.
    123 
    124    UMutableCPTrie        *fMutableTrie;    // The mapping TRIE that is the end result of processing
    125    UCPTrie               *fTrie;           //  the Unicode Sets.
    126    uint32_t               fTrieSize;       // NOTE(review): presumably the byte size reported by getTrieSize() -- confirm.
    127 
    128    // Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
    129    int32_t               fGroupCount;
    130 
    131    // The number of the first dictionary char category.
    132    // If there are no Dictionary categories, set to the last category + 1.
    133    int32_t               fDictCategoriesStart;
    134 
    135    UBool                 fSawBOF;          // NOTE(review): presumably the value returned by sawBOF() -- confirm.
    136 
    137    RBBISetBuilder(const RBBISetBuilder &other) = delete; // Forbid copying of this class.
    138    RBBISetBuilder &operator=(const RBBISetBuilder &other) = delete; // Forbid assignment of this class.
    139 };
    140 
    141 
    142 
    143 U_NAMESPACE_END
    144 
    145 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    146 
    147 #endif