tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbt_rule.h (11974B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 * Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   11/17/99    aliu        Creation.
      8 **********************************************************************
      9 */
     10 #ifndef RBT_RULE_H
     11 #define RBT_RULE_H
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/uobject.h"
     18 #include "unicode/unistr.h"
     19 #include "unicode/utrans.h"
     20 #include "unicode/unimatch.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 class Replaceable;
     25 class TransliterationRuleData;
     26 class StringMatcher;
     27 class UnicodeFunctor;
     28 
     29 /**
     30 * A transliteration rule used by
     31 * <code>RuleBasedTransliterator</code>.
     32 * <code>TransliterationRule</code> is an immutable object, apart from
        * setData(), which rebinds the rule to a different data object.
     33 *
     34 * <p>A rule consists of an input pattern and an output string.  When
     35 * the input pattern is matched, the output string is emitted.  The
     36 * input pattern consists of zero or more characters which are matched
     37 * exactly (the key) and optional context.  Context must match if it
     38 * is specified.  Context may be specified before the key, after the
     39 * key, or both.  The key, preceding context, and following context
     40 * may contain variables.  Variables represent a set of Unicode
     41 * characters, such as the letters <i>a</i> through <i>z</i>.
     42 * Variables are detected by looking up each character in a supplied
     43 * variable list to see if it has been so defined.
     44 *
     45 * <p>A rule may contain segments in its input string and segment
     46 * references in its output string.  A segment is a substring of the
     47 * input pattern, indicated by an offset and limit.  The segment may
     48 * be in the preceding or following context.  It may not span a
     49 * context boundary.  A segment reference is a special character in
     50 * the output string that causes a segment of the input string (not
     51 * the input pattern) to be copied to the output string.  The range of
     52 * special characters that represent segment references is defined by
     53 * RuleBasedTransliterator.Data.
        * NOTE(review): that is a Java-style nested-class name; the C++
        * counterpart is presumably TransliterationRuleData -- confirm.
     54 *
        * <p>Ownership, as documented on the members below: the segments
        * array is owned by the rule, but the pointers stored in it are not,
        * and data is an alias pointer.
        *
     55 * @author Alan Liu
     56 */
     57 class TransliterationRule : public UMemory {
     58 
     59 private:
     60 
     61    // TODO Eliminate the pattern and keyLength data members.  They
     62    // are used only by masks() and getIndexValue() which are called
     63    // only during build time, not during run-time.  Perhaps these
     64    // methods and pattern/keyLength can be isolated into a separate
     65    // object.
     66 
     67    /**
     68     * The match that must occur before the key, or null if there is no
     69     * preceding context.
     70     */
     71    StringMatcher *anteContext;
     72 
     73    /**
     74     * The matcher object for the key.  If null, then the key is empty.
     75     */
     76    StringMatcher *key;
     77 
     78    /**
     79     * The match that must occur after the key, or null if there is no
     80     * following context.
     81     */
     82    StringMatcher *postContext;
     83 
     84    /**
     85     * The object that performs the replacement if the key,
     86     * anteContext, and postContext are matched.  Never null.
     87     */
     88    UnicodeFunctor* output;
     89 
     90    /**
     91     * The string that must be matched, consisting of the anteContext, key,
     92     * and postContext, concatenated together, in that order.  Some components
     93     * may be empty (zero length).
     94     * @see anteContextLength
     95     * @see keyLength
     96     */
     97    UnicodeString pattern;
     98 
     99    /**
    100     * An array of matcher objects corresponding to the input pattern
    101     * segments.  If there are no segments this is null.  N.B. The
    102     * elements are declared as UnicodeFunctor for generality, but in
    103     * practice each one is always a StringMatcher.  In the future we may
    104     * generalize this, but for now we sometimes cast down to StringMatcher.
    105     *
    106     * The array is owned, but the pointers within it are not.
    107     */
    108    UnicodeFunctor** segments;
    109 
    110    /**
    111     * The number of elements in segments[] or zero if segments is nullptr.
    112     */
    113    int32_t segmentsCount;
    114 
    115    /**
    116     * The length of the string that must match before the key.  If
    117     * zero, then there is no matching requirement before the key.
    118     * Substring [0,anteContextLength) of pattern is the anteContext.
    119     */
    120    int32_t anteContextLength;
    121 
    122    /**
    123     * The length of the key.  Substring [anteContextLength,
    124     * anteContextLength + keyLength) of pattern is the key.
    125 
    126     */
    127    int32_t keyLength;
    128 
    129    /**
    130     * Miscellaneous attributes.
            * NOTE(review): presumably a bitwise OR of the ANCHOR_* values
            * declared below -- confirm against the implementation.
    131     */
    132    int8_t flags;
    133 
    134    /**
    135     * Flag attributes.  The two values are distinct bits (1 and 2).
    136     */
    137    enum {
    138        ANCHOR_START = 1,
    139        ANCHOR_END   = 2
    140    };
    141 
    142    /**
    143     * An alias pointer to the data for this rule.  The data provides
    144     * lookup services for matchers and segments.
    145     */
    146    const TransliterationRuleData* data;
    147 
    148 public:
    149 
    150    /**
    151     * Construct a new rule with the given input, output text, and other
    152     * attributes.  A cursor position may be specified for the output text.
    153     * @param input          input string, including key and optional ante and
    154     *                       post context.
    155     * @param anteContextPos offset into input to end of ante context, or -1 if
    156     *                       none.  Must be <= input.length() if not -1.
    157     * @param postContextPos offset into input to start of post context, or -1
    158     *                       if none.  Must be <= input.length() if not -1, and must be >=
    159     *                       anteContextPos.
    160     * @param outputStr      output string.
    161     * @param cursorPosition offset into outputStr at which cursor is located, or -1 if
    162     *                       none.  If less than zero, then the cursor is placed after the
    163     *                       <code>outputStr</code>; that is, -1 is equivalent to
    164     *                       <code>outputStr.length()</code>.  If greater than
    165     *                       <code>outputStr.length()</code> then an error is reported
            *                       (presumably through status; the earlier wording "an
            *                       exception is thrown" looked inherited from the Java
            *                       version -- confirm against the implementation).
    166     * @param cursorOffset   an offset to be added to cursorPosition to position the
    167     *                       cursor either in the ante context, if < 0, or in the post context, if >
    168     *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
    169     *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset
    170     *                       of -3.
    171     * @param segs           array of UnicodeFunctor pointers corresponding to input pattern
    172     *                       segments, or null if there are none.  The array itself is adopted,
    173     *                       but the pointers within it are not.
    174     * @param segsCount      number of elements in segs[].
    175     * @param anchorStart    true if the rule is anchored on the left to
    176     *                       the context start.
    177     * @param anchorEnd      true if the rule is anchored on the right to the
    178     *                       context limit.
    179     * @param data           the rule data.
    180     * @param status         Output parameter filled in with success or failure status.
    181     */
    182    TransliterationRule(const UnicodeString& input,
    183                        int32_t anteContextPos, int32_t postContextPos,
    184                        const UnicodeString& outputStr,
    185                        int32_t cursorPosition, int32_t cursorOffset,
    186                        UnicodeFunctor** segs,
    187                        int32_t segsCount,
    188                        UBool anchorStart, UBool anchorEnd,
    189                        const TransliterationRuleData* data,
    190                        UErrorCode& status);
    191 
    192    /**
    193     * Copy constructor.  Note that the source is taken by non-const
            * reference, so it cannot be invoked on a const rule or a temporary.
    194     * @param other    the object to be copied.
    195     */
    196    TransliterationRule(TransliterationRule& other);
    197 
    198    /**
    199     * Destructor.
    200     */
    201    virtual ~TransliterationRule();
    202 
    203    /**
    204     * Change the data object that this rule belongs to.  Used
    205     * internally by the TransliterationRuleData copy constructor.
    206     * @param data    the new data value to be set.
    207     */
    208    void setData(const TransliterationRuleData* data);
    209 
    210    /**
    211     * Return the preceding context length.  This method is needed to
    212     * support the <code>Transliterator</code> method
    213     * <code>getMaximumContextLength()</code>.  Internally, this is
    214     * implemented as the anteContextLength, optionally plus one if
    215     * there is a start anchor.  The one character anchor gap is
    216     * needed to make repeated incremental transliteration with
    217     * anchors work.
    218     * @return    the preceding context length.
    219     */
    220    virtual int32_t getContextLength() const;
    221 
    222    /**
    223     * Internal method.  Returns 8-bit index value for this rule.
    224     * This is the low byte of the first character of the key,
    225     * unless the first character of the key is a set.  If it's a
    226     * set, or otherwise can match multiple keys, the index value is -1.
            * The int16_t return type holds both the 0..255 range and the
            * -1 sentinel.
    227     * @return    8-bit index value for this rule, or -1.
    228     */
    229    int16_t getIndexValue() const;
    230 
    231    /**
    232     * Internal method.  Returns true if this rule matches the given
    233     * index value.  The index value is an 8-bit integer, 0..255,
    234     * representing the low byte of the first character of the key.
    235     * It matches this rule if it matches the first character of the
    236     * key, or if the first character of the key is a set, and the set
    237     * contains any character with a low byte equal to the index
    238     * value.  If the rule contains only ante context, as in foo)>bar,
    239     * then it will match any key.
    240     * @param v    the given index value.
    241     * @return     true if this rule matches the given index value.
    242     */
    243    UBool matchesIndexValue(uint8_t v) const;
    244 
    245    /**
    246     * Return true if this rule masks another rule.  If r1 masks r2 then
    247     * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
    248     * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
    249     * "[c]a>x" masks "[dc]a>y".
    250     * @param r2  the given rule to be compared with.
    251     * @return    true if this rule masks 'r2'
    252     */
    253    virtual UBool masks(const TransliterationRule& r2) const;
    254 
    255    /**
    256     * Attempt a match and replacement at the given position.  Return
    257     * the degree of match between this rule and the given text.  The
    258     * degree of match may be mismatch, a partial match, or a full
    259     * match.  A mismatch means at least one character of the text
    260     * does not match the context or key.  A partial match means some
    261     * context and key characters match, but the text is not long
    262     * enough to match all of them.  A full match means all context
    263     * and key characters match.
    264     *
    265     * If a full match is obtained, perform a replacement, update pos,
    266     * and return U_MATCH.  Otherwise both text and pos are unchanged.
    267     *
    268     * @param text the text
    269     * @param pos the position indices
    270     * @param incremental if true, test for partial matches that may
    271     * be completed by additional text inserted at pos.limit.
    272     * @return one of <code>U_MISMATCH</code>,
    273     * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
    274     * incremental is false then U_PARTIAL_MATCH will not be returned.
    275     */
    276    UMatchDegree matchAndReplace(Replaceable& text,
    277                                 UTransPosition& pos,
    278                                 UBool incremental) const;
    279 
    280    /**
    281     * Create a rule string that represents this rule object.  Append
    282     * it to the given string.
            * @param pat               the string to which the rule text is appended.
            * @param escapeUnprintable NOTE(review): presumably requests escaping of
            *                          unprintable characters in the generated text --
            *                          confirm against the implementation.
            * @return                  a UnicodeString reference (presumably pat -- confirm).
    283     */
    284    virtual UnicodeString& toRule(UnicodeString& pat,
    285                                  UBool escapeUnprintable) const;
    286 
    287    /**
    288     * Union the set of all characters that may be modified by this rule
    289     * into the given set.
            * @param toUnionTo    the set that receives the characters.
    290     */
    291    void addSourceSetTo(UnicodeSet& toUnionTo) const;
    292 
    293    /**
    294     * Union the set of all characters that may be emitted by this rule
    295     * into the given set.
            * @param toUnionTo    the set that receives the characters.
    296     */
    297    void addTargetSetTo(UnicodeSet& toUnionTo) const;
    298 
    299 private:
    300 
    301    friend class StringMatcher;
    302 
    303    TransliterationRule &operator=(const TransliterationRule &other); // declared private to forbid assignment (the copy constructor above is public)
    304 };
    305 
    306 U_NAMESPACE_END
    307 
    308 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    309 
    310 #endif