tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbbidata.h (9549B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2014 International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  rbbidata.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   RBBI data formats  Includes
     16 *
     17 *                          Structs that describe the format of the Binary RBBI data,
     18 *                          as it is stored in ICU's data file.
     19 *
     20 *      RBBIDataWrapper  -  Instances of this class sit between the
     21 *                          raw data structs and the RulesBasedBreakIterator objects
     22 *                          that are created by applications.  The wrapper class
     23 *                          provides reference counting for the underlying data,
     24 *                          and direct pointers to data that would not otherwise
     25 *                          be accessible without ugly pointer arithmetic.  The
     26 *                          wrapper does not attempt to provide any higher level
     27 *                          abstractions for the data itself.
     28 *
     29 *                          There will be only one instance of RBBIDataWrapper for any
     30 *                          set of RBBI run time data being shared by instances
     31 *                          (clones) of RulesBasedBreakIterator.
     32 */
     33 
     34 #ifndef __RBBIDATA_H__
     35 #define __RBBIDATA_H__
     36 
     37 #include "unicode/utypes.h"
     38 #include "unicode/udata.h"
     39 #include "udataswp.h"
     40 
     41 /**
     42 * Swap RBBI data. See udataswp.h for the UDataSwapper type and the swapping contract.
     43 * Declared before the __cplusplus section of this header, so it is visible to C callers too.
     44 * @internal */
     45 U_CAPI int32_t U_EXPORT2
     46 ubrk_swap(const UDataSwapper *ds,                                   // swapper description; not modified through this pointer
     47          const void *inData, int32_t length, void *outData,        // read-only input, writable output; NOTE(review): length presumably counts bytes of inData - confirm in udataswp.h
     48          UErrorCode *pErrorCode);                                  // error code passed by pointer
     49 
     50 #ifdef __cplusplus
     51 
     52 #include "unicode/ucptrie.h"
     53 #include "unicode/uobject.h"
     54 #include "unicode/unistr.h"
     55 #include "unicode/uversion.h"
     56 #include "umutex.h"
     57 
     58 
     59 U_NAMESPACE_BEGIN
     60 // Four version bytes, {6, 0, 0, 0}, one array per including translation unit (static const).
     61 // The current RBBI data format version.  NOTE(review): presumably checked against RBBIDataHeader::fFormatVersion - confirm at use sites.
     62 static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0};
     63 
     64 /*
     65 *   The following structs map exactly onto the raw data from the ICU common data file; field order and types are the binary layout.
     66 */
     67 struct RBBIDataHeader {
     68    uint32_t         fMagic;           /*  == 0xb1a0 (digit one; "0xbla0" is not valid hex). NOTE(review): confirm against the check in rbbidata.cpp */
     69    UVersionInfo     fFormatVersion;   /* Data Format.  Same as the value in struct UDataInfo      */
     70                                       /*   if there is one associated with this data.             */
     71                                       /*     (version originates in rbbi, is copied to UDataInfo) */
     72    uint32_t         fLength;          /*  Total length in bytes of this RBBI Data,                */
     73                                       /*      including all sections, not just the header.        */
     74    uint32_t         fCatCount;        /*  Number of character categories.                         */
     75 
     76    /*                                                                        */
     77    /*  Offsets and sizes of each of the subsections within the RBBI data.    */
     78    /*  All offsets are bytes from the start of the RBBIDataHeader.           */
     79    /*  All sizes are in bytes.                                               */
     80    /*                                                                        */
     81    uint32_t         fFTable;         /*  Offset to the forward state transition table. */
     82    uint32_t         fFTableLen;      /*    Size in bytes of the forward table. */
     83    uint32_t         fRTable;         /*  Offset to the reverse state transition table. */
     84    uint32_t         fRTableLen;      /*    Size in bytes of the reverse table. */
     85    uint32_t         fTrie;           /*  Offset to Trie data for character categories */
     86    uint32_t         fTrieLen;        /*    Size in bytes of the Trie data. */
     87    uint32_t         fRuleSource;     /*  Offset to the source for the break rules. */
     88    uint32_t         fRuleSourceLen;  /*    Size in bytes of the rule source.  Earlier note said "Stored char16_t *"; NOTE(review): RBBIDataWrapper::fRuleSource is const char *, so confirm the stored encoding. */
     89    uint32_t         fStatusTable;    /* Offset to the table of rule status values */
     90    uint32_t         fStatusTableLen; /*    Size in bytes of the rule status table. */
     91 
     92    uint32_t         fReserved[6];    /*  Reserved for expansion */
     93 
     94 };
     95 // One row of a state transition table.  T is the integer type of every cell in the row;
     96 // this header instantiates it with uint8_t and uint16_t (see the typedefs after the struct).
     97 // Rows are variable length: fNextState really holds one entry per character category.
     98 template <typename T>
     99 struct RBBIStateTableRowT {
    100    T               fAccepting;    //  Non-zero if this row is for an accepting state.
    101                                   //  Value 0: not an accepting state.
    102                                   //        1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state.
    103                                   //       >1: Look-ahead match has completed.
    104                                   //           Actual boundary position happened earlier.
    105                                   //           Value here == fLookAhead in earlier
    106                                   //           state, at actual boundary pos.
    107    T               fLookAhead;    //  Non-zero if this row is for a state that
    108                                   //    corresponds to a '/' in the rule source.
    109                                   //    Value is the same as the fAccepting
    110                                   //    value for the rule (which will appear
    111                                   //    in a different state).
    112    T               fTagsIdx;      //  Non-zero if this row covers a {tagged} position
    113                                   //    from a rule.  Value is the index in the
    114                                   //    StatusTable of the set of matching
    115                                   //    tags (rule status values)
    116    T               fNextState[1]; //  Next State, indexed by char category.
    117                                   //    Variable-length array declared with length 1
    118                                   //    to disable bounds checkers.
    119                                   //    Array Size is actually fData->fHeader->fCatCount
    120                                   //    CAUTION:  see RBBITableBuilder::getTableSize()
    121                                   //              before changing anything here.
    122 };
    123 // Concrete row types: the row template instantiated with 8-bit cells and with 16-bit cells.
    124 typedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8;
    125 typedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16;
    126 
    127 constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1;   // fAccepting value that marks an unconditional accepting state (0 = not accepting, >1 = completed look-ahead match).
    128 // A row viewed with either cell width; r16 and r8 overlay the same storage.  NOTE(review): which member is valid presumably follows RBBI_8BITS_ROWS - confirm at use sites.
    129 union RBBIStateTableRow {
    130  RBBIStateTableRow16 r16;   // row with uint16_t cells
    131  RBBIStateTableRow8 r8;     // row with uint8_t cells
    132 };
    133 // Fixed-size header of one state transition table; the variable-length rows start at fTableData.
    134 struct RBBIStateTable {
    135    uint32_t         fNumStates;            // Number of states.
    136    uint32_t         fRowLen;               // Length of a state table row, in bytes.
    137    uint32_t         fDictCategoriesStart;  // Char category number of the first dictionary
    138                                            //   char class, or the largest category number + 1
    139                                            //   if there are no dictionary categories.
    140    uint32_t         fLookAheadResultsSize; // Size of run-time array required for holding
    141                                            //   look-ahead results. Indexed by row.fLookAhead.
    142    uint32_t         fFlags;                // Option Flags for this state table.
    143    char             fTableData[1];         // First RBBIStateTableRow begins here.
    144                                            //   Variable-length array declared with length 1
    145                                            //   to disable bounds checkers.
    146                                            //   (making it char[] simplifies ugly address
    147                                            //   arithmetic for indexing variable length rows:
    148 };                                         //   rows can be addressed by byte offset, using fRowLen.)
    149 // Three distinct bit values (1, 2, 4).  NOTE(review): presumably combined in RBBIStateTable::fFlags - confirm at use sites.
    150 constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;
    151 constexpr uint32_t RBBI_BOF_REQUIRED = 2;
    152 constexpr uint32_t RBBI_8BITS_ROWS = 4;   // NOTE(review): presumably marks tables whose rows are RBBIStateTableRow8 - confirm.
    153 
    154 
    155 /*   The reference counting wrapper class.  Only declarations live here;   */
    156 /*   the member functions are defined outside this header.  Copying is     */
    157 /*   disabled (copy constructor and copy assignment are deleted).          */
    158 class RBBIDataWrapper : public UMemory {
    159 public:
    160    enum EDontAdopt {          // Tag type taken by the second constructor overload below.
    161        kDontAdopt
    162    };
    163    RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
    164    RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);  // NOTE(review): presumably leaves ownership of data with the caller (cf. fDontFreeData) - confirm.
    165    RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
    166    ~RBBIDataWrapper();
    167 
    168    static UBool          isDataVersionAcceptable(const UVersionInfo version);
    169 
    170    void                  init0();
    171    void                  init(const RBBIDataHeader *data, UErrorCode &status);
    172    RBBIDataWrapper      *addReference();      // NOTE(review): presumably bumps fRefCount and returns this - confirm.
    173    void                  removeReference();   // NOTE(review): presumably drops one reference - confirm what happens at zero.
    174    bool                  operator ==(const RBBIDataWrapper &other) const;
    175    int32_t               hashCode();          // Not const-qualified, unlike operator== above.
    176    const UnicodeString  &getRuleSourceString() const;
    177    void                  printData();
    178    void                  printTable(const char *heading, const RBBIStateTable *table);
    179 
    180    /*                                                                        */
    181    /*   Pointers to items within the data (public; all but fTrie are const)  */
    182    /*                                                                        */
    183    const RBBIDataHeader     *fHeader;
    184    const RBBIStateTable     *fForwardTable;
    185    const RBBIStateTable     *fReverseTable;
    186    const char               *fRuleSource;
    187    const int32_t            *fRuleStatusTable; 
    188 
    189    /* number of int32_t values in the rule status table.   Used to sanity check indexing */
    190    int32_t             fStatusMaxIdx;
    191 
    192    UCPTrie             *fTrie;
    193 
    194 private:
    195    u_atomic_int32_t    fRefCount;       // atomic 32-bit counter type from umutex.h
    196    UDataMemory        *fUDataMem;
    197    UnicodeString       fRuleString;
    198    UBool               fDontFreeData;
    199 
    200    RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /*  forbid copying of this class */
    201    RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /*  forbid copying of this class */
    202 };
    203 
    204 
    205 
    206 U_NAMESPACE_END
    207 // Declared after the ICU namespace is closed, inside the C++-only part of this header; defined elsewhere.
    208 U_CFUNC UBool rbbi_cleanup();
    209 
    210 #endif /* C++ */
    211 
    212 #endif