tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbbirb.h (9055B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  rbbirb.h
      5 //
      6 //  Copyright (C) 2002-2008, International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains declarations for several classes from the
     10 //    Rule Based Break Iterator rule builder.
     11 //
     12 
     13 
     14 #ifndef RBBIRB_H
     15 #define RBBIRB_H
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_BREAK_ITERATION
     20 
     21 #include <utility>
     22 
     23 #include "unicode/uobject.h"
     24 #include "unicode/rbbi.h"
     25 #include "unicode/uniset.h"
     26 #include "unicode/parseerr.h"
     27 #include "uhash.h"
     28 #include "uvector.h"
     29 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
     30                             //    looks up references to $variables within a set.
     31 
     32 
     33 U_NAMESPACE_BEGIN
     34 
     // Forward declarations of rule-builder types that this header only refers
     // to through pointers; their definitions are not needed here.
     class  RBBIRuleScanner;
     struct RBBIRuleTableEl;
     class  RBBISetBuilder;
     class  RBBINode;
     class  RBBITableBuilder;
     40 
     41 
     42 
     43 //--------------------------------------------------------------------------------
     44 //
     45 //   RBBISymbolTable.    Implements SymbolTable interface that is used by the
     46 //                       UnicodeSet parser to resolve references to $variables.
     47 //
     48 //--------------------------------------------------------------------------------
     49 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
     50 public:                                       //   of these structs for each entry.
     51    RBBISymbolTableEntry();
     52    UnicodeString          key;
     53    RBBINode               *val;
     54    ~RBBISymbolTableEntry();
     55 
     56 private:
     57    RBBISymbolTableEntry(const RBBISymbolTableEntry &other) = delete; // forbid copying of this class
     58    RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other) = delete; // forbid copying of this class
     59 };
     60 
     61 
     62 class RBBISymbolTable : public UMemory, public SymbolTable {
     63 private:
     64    const UnicodeString      &fRules;
     65    UHashtable               *fHashTable;
     66    RBBIRuleScanner          *fRuleScanner;
     67 
     68    // These next two fields are part of the mechanism for passing references to
     69    //   already-constructed UnicodeSets back to the UnicodeSet constructor
     70    //   when the pattern includes $variable references.
     71    const UnicodeString      ffffString;      // = "/uffff"
     72    UnicodeSet              *fCachedSetLookup;
     73 
     74 public:
     75    //  API inherited from class SymbolTable
     76    virtual const UnicodeString*  lookup(const UnicodeString& s) const override;
     77    virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const override;
     78    virtual UnicodeString parseReference(const UnicodeString& text,
     79                                         ParsePosition& pos, int32_t limit) const override;
     80 
     81    //  Additional Functions
     82    RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
     83    virtual ~RBBISymbolTable();
     84 
     85    virtual RBBINode *lookupNode(const UnicodeString &key) const;
     86    virtual void      addEntry  (const UnicodeString &key, RBBINode *val, UErrorCode &err);
     87 
     88 #ifdef RBBI_DEBUG
     89    virtual void      rbbiSymtablePrint() const;
     90 #else
     91    // A do-nothing inline function for non-debug builds.  Member funcs can't be empty
     92    //  or the call sites won't compile.
     93    int32_t fFakeField;
     94    #define rbbiSymtablePrint() fFakeField=0; 
     95 #endif
     96 
     97 private:
     98    RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
     99    RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
    100 };
    101 
    102 
    103 //--------------------------------------------------------------------------------
    104 //
    105 //  class RBBIRuleBuilder       The top-level class handling RBBI rule compiling.
    106 //
    107 //--------------------------------------------------------------------------------
    108 class RBBIRuleBuilder : public UMemory {
    109 public:
    110 
    111    //  Create a rule based break iterator from a set of rules.
    112    //  This function is the main entry point into the rule builder.  The
    113    //   public ICU API for creating RBBIs uses this function to do the actual work.
    114    //
    115    static BreakIterator * createRuleBasedBreakIterator( const UnicodeString    &rules,
    116                                    UParseError      *parseError,
    117                                    UErrorCode       &status);
    118 
    119 public:
    120    // The "public" functions and data members that appear below are accessed
    121    //  (and shared) by the various parts that make up the rule builder.  They
    122    //  are NOT intended to be accessed by anything outside of the
    123    //  rule builder implementation.
    124    RBBIRuleBuilder(const UnicodeString  &rules,
    125                    UParseError          *parseErr,
    126                    UErrorCode           &status
    127    );
    128 
    129    virtual    ~RBBIRuleBuilder();
    130 
    131    /**
    132     *  Build the state tables and char class Trie from the source rules.
    133     */
    134    RBBIDataHeader  *build(UErrorCode &status);
    135 
    136 
    137    /**
    138     * Fold together redundant character classes (table columns) and
    139     * redundant states (table rows). Done after initial table generation,
    140     * before serializing the result.
    141     */
    142    void optimizeTables();
    143 
    144    char                          *fDebugEnv;        // controls debug trace output
    145    UErrorCode                    *fStatus;          // Error reporting.  Keeping status
    146    UParseError                   *fParseError;      //   here avoids passing it everywhere.
    147    const UnicodeString           &fRules;           // The rule string that we are compiling
    148    UnicodeString                 fStrippedRules;    // The rule string, with comments stripped.
    149 
    150    RBBIRuleScanner               *fScanner;         // The scanner.
    151    RBBINode                      *fForwardTree;     // The parse trees, generated by the scanner,
    152    RBBINode                      *fReverseTree;     //   then manipulated by subsequent steps.
    153    RBBINode                      *fSafeFwdTree;
    154    RBBINode                      *fSafeRevTree;
    155 
    156    RBBINode                      **fDefaultTree;    // For rules not qualified with a !
    157                                                     //   the tree to which they belong to.
    158 
    159    UBool                         fChainRules;       // True for chained Unicode TR style rules.
    160                                                     // False for traditional regexp rules.
    161 
    162    UBool                         fLookAheadHardBreak;  // True:  Look ahead matches cause an
    163                                                     // immediate break, no continuing for the
    164                                                     // longest match.
    165 
    166    RBBISetBuilder                *fSetBuilder;      // Set and Character Category builder.
    167    UVector                       *fUSetNodes;       // Vector of all uset nodes.
    168 
    169    RBBITableBuilder              *fForwardTable;    // State transition table, build time form.
    170 
    171    UVector                       *fRuleStatusVals;  // The values that can be returned
    172                                                     //   from getRuleStatus().
    173 
    174    RBBIDataHeader                *flattenData();    // Create the flattened (runtime format)
    175                                                     // data tables..
    176 private:
    177    RBBIRuleBuilder(const RBBIRuleBuilder &other) = delete; // forbid copying of this class
    178    RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other) = delete; // forbid copying of this class
    179 };
    180 
    181 
    182 
    183 
    184 //----------------------------------------------------------------------------
    185 //
    186 //   RBBISetTableEl   is an entry in the hash table of UnicodeSets that have
    187 //                    been encountered.  The val Node will be of nodetype uset
    188 //                    and contain pointers to the actual UnicodeSets.
    189 //                    The Key is the source string for initializing the set.
    190 //
    191 //                    The hash table is used to avoid creating duplicate
    192 //                    unnamed (not $var references) UnicodeSets.
    193 //
    194 //                    Memory Management:
    195 //                       The Hash Table owns these RBBISetTableEl structs and
    196 //                            the key strings.  It does NOT own the val nodes.
    197 //
    198 //----------------------------------------------------------------------------
    199 struct RBBISetTableEl {
    200    UnicodeString *key;
    201    RBBINode      *val;
    202 };
    203 
    /**
     *  Two int32_t values bundled together; used for pairs of states or
     *  pairs of character classes.
     */
    using IntPair = std::pair<int32_t, int32_t>;
    208 
    209 
    210 //----------------------------------------------------------------------------
    211 //
    212 //   RBBIDebugPrintf    Printf equivalent, for debugging output.
    213 //                      Conditional compilation of the implementation lets us
    214 //                      get rid of the stdio dependency in environments where it
    215 //                      is unavailable.
    216 //
    217 //----------------------------------------------------------------------------
    #if defined(RBBI_DEBUG)
    // Debug builds: the debug output helpers map directly onto stdio.
    // NOTE(review): this include appears between U_NAMESPACE_BEGIN and
    //   U_NAMESPACE_END - confirm that is intended.
    #  include <stdio.h>
    #  define RBBIDebugPrintf printf
    #  define RBBIDebugPuts puts
    #else
    // Non-debug builds: no stdio dependency.  RBBIDebugPrintf is left
    // undefined, and RBBIDebugPuts(arg) expands to nothing.
    #  undef RBBIDebugPrintf
    #  define RBBIDebugPuts(arg)
    #endif
    226 
    227 U_NAMESPACE_END
    228 
    229 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    230 
    231 #endif