tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

strmatch.h (8808B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 * Copyright (C) 2001-2011, International Business Machines Corporation
      5 * and others. All Rights Reserved.
      6 **********************************************************************
      7 *   Date        Name        Description
      8 *   07/23/01    aliu        Creation.
      9 **********************************************************************
     10 */
     11 #ifndef STRMATCH_H
     12 #define STRMATCH_H
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_TRANSLITERATION
     17 
     18 #include "unicode/unistr.h"
     19 #include "unicode/unifunct.h"
     20 #include "unicode/unimatch.h"
     21 #include "unicode/unirepl.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 class TransliterationRuleData;
     26 
     27 /**
     28 * An object that matches a fixed input string, implementing the
     29 * UnicodeMatcher API.  This object also implements the
     30 * UnicodeReplacer API, allowing it to emit the matched text as
     31 * output.  Since the match text may contain flexible match elements,
     32 * such as UnicodeSets, the emitted text is not the match pattern, but
     33 * instead a substring of the actual matched text.  Following
     34 * convention, the output text is the leftmost match seen up to this
     35 * point.
     36 *
     37 * A StringMatcher may represent a segment, in which case it has a
     38 * positive segment number.  This affects how the matcher converts
     39 * itself to a pattern but does not otherwise affect its function.
     40 *
     41 * A StringMatcher that is not a segment should not be used as a
     42 * UnicodeReplacer.
     43 */
     44 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
     45 
     46 public:
     47 
     48    /**
     49     * Construct a matcher that matches the given pattern string.
     50     * @param string the pattern to be matched, possibly containing
     51     * stand-ins that represent nested UnicodeMatcher objects.
     52     * @param start inclusive start index -- presumably into 'string' (TODO confirm)
     53     * @param limit exclusive end index -- presumably into 'string' (TODO confirm);
     54     * must be greater than or equal to start
     55     * @param segmentNum the segment number from 1..n, or 0 if this is
     56     * not a segment.
     57     * @param data context object mapping stand-ins to
     58     * UnicodeMatcher objects.
     59     */
     60    StringMatcher(const UnicodeString& string,
     61                  int32_t start,
     62                  int32_t limit,
     63                  int32_t segmentNum,
     64                  const TransliterationRuleData& data);
     65 
     66    /**
     67     * Copy constructor
     68     * @param o  the object to be copied.
     69     */
     70    StringMatcher(const StringMatcher& o);
     71        
     72    /**
     73     * Destructor
     74     */
     75    virtual ~StringMatcher();
     76 
     77    /**
     78     * Implement UnicodeFunctor
     79     * @return a copy of the object.
     80     */
     81    virtual StringMatcher* clone() const override;
     82 
     83    /**
     84     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     85     * and return the pointer.
     86     * @return the UnicodeMatcher pointer.
     87     */
     88    virtual UnicodeMatcher* toMatcher() const override;
     89 
     90    /**
     91     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     92     * and return the pointer.
     93     * @return the UnicodeReplacer pointer.
     94     */
     95    virtual UnicodeReplacer* toReplacer() const override;
     96 
     97    /**
     98     * Implement UnicodeMatcher
     99     * @param text the text to be matched
    100     * @param offset on input, the index into text at which to begin
    101     * matching.  On output, the limit of the matched text.  The
    102     * number of matched characters is the output value of offset
    103     * minus the input value.  Offset should always point to the
    104     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
    105     * both on entry and upon return.
    106     * @param limit the limit index of text to be matched.  Greater
    107     * than offset for a forward direction match, less than offset for
    108     * a backward direction match.  The last character to be
    109     * considered for matching will be text.charAt(limit-1) in the
    110     * forward direction or text.charAt(limit+1) in the backward
    111     * direction.
    112     * @param incremental  if true, then assume further characters may
    113     * be inserted at limit and check for partial matching.  Otherwise
    114     * assume the text as given is complete.
    115     * @return a match degree value indicating a full match, a partial
    116     * match, or a mismatch.  If incremental is false then
    117     * U_PARTIAL_MATCH should never be returned.
    118     */
    119    virtual UMatchDegree matches(const Replaceable& text,
    120                                 int32_t& offset,
    121                                 int32_t limit,
    122                                 UBool incremental) override;
    123 
    124    /**
    125     * Implement UnicodeMatcher
    126     * @param result            Output param to receive the pattern.
    127     * @param escapeUnprintable if true then escape the unprintable characters.
    128     * @return                  A reference to 'result'.
    129     */
    130    virtual UnicodeString& toPattern(UnicodeString& result,
    131                                     UBool escapeUnprintable = false) const override;
    132 
    133    /**
    134     * Implement UnicodeMatcher
    135     * Returns true if this matcher will match a character c, where c
    136     * & 0xFF == v, at offset, in the forward direction (with limit >
    137     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
    138     * indexing.
    139     * @param v    the given value
    140     * @return     true if this matcher will match a character c, 
    141     *             where c & 0xFF == v
    142     */
    143    virtual UBool matchesIndexValue(uint8_t v) const override;
    144 
    145    /**
    146     * Implement UnicodeMatcher.  Presumably unions the characters this matcher can match into toUnionTo (TODO confirm).
    147     */
    148    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
    149 
    150    /**
    151     * Implement UnicodeFunctor.  Presumably replaces the context held in 'data' (TODO confirm in strmatch.cpp).
    152     */
    153    virtual void setData(const TransliterationRuleData*) override;
    154 
    155    /**
    156     * Replace characters in 'text' from 'start' to 'limit' with the
    157     * output text of this object.  Update the 'cursor' parameter to
    158     * give the cursor position and return the length of the
    159     * replacement text.
    160     *
    161     * @param text the text in which characters are replaced
    162     * @param start inclusive start index of text to be replaced
    163     * @param limit exclusive end index of text to be replaced;
    164     * must be greater than or equal to start
    165     * @param cursor output parameter for the cursor position.
    166     * Not all replacer objects will update this, but in a complete
    167     * tree of replacer objects, representing the entire output side
    168     * of a transliteration rule, at least one must update it.
    169     * @return the number of 16-bit code units in the text replacing
    170     * the characters at offsets start..(limit-1) in text
    171     */
    172    virtual int32_t replace(Replaceable& text,
    173                            int32_t start,
    174                            int32_t limit,
    175                            int32_t& cursor) override;
    176 
    177    /**
    178     * Returns a string representation of this replacer.  If the
    179     * result of calling this function is passed to the appropriate
    180     * parser, typically TransliteratorParser, it will produce another
    181     * replacer that is equal to this one.
    182     * @param result the string to receive the pattern.  Previous
    183     * contents will be deleted.
    184     * @param escapeUnprintable if true then convert unprintable
    185     * characters to their hex escape representations, \\uxxxx or
    186     * \\Uxxxxxxxx.  Unprintable characters are defined by
    187     * Utility.isUnprintable().
    188     * @return a reference to 'result'.
    189     */
    190    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
    191                                             UBool escapeUnprintable) const override;
    192 
    193    /**
    194     * Remove any match data.  This must be called before performing a
    195     * set of matches with this segment.
    196     */
    197    void resetMatch();
    198 
    199    /**
    200     * ICU "poor man's RTTI", returns a UClassID for the actual class.
    201     */
    202    virtual UClassID getDynamicClassID() const override;
    203 
    204    /**
    205     * ICU "poor man's RTTI", returns a UClassID for this class.
    206     */
    207    static UClassID U_EXPORT2 getStaticClassID();
    208 
    209    /**
    210     * Union the set of all characters that may be output by this object
    211     * into the given set.
    212     * @param toUnionTo the set into which to union the output characters
    213     */
    214    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const override;
    215 
    216 private:
    217 
    218    /**
    219     * The text to be matched.
    220     */
    221    UnicodeString pattern;
    222 
    223    /**
    224     * Context object that maps stand-ins to matcher and replacer
    225     * objects.
    226     */
    227    const TransliterationRuleData* data;
    228 
    229    /**
    230     * The segment number, 1-based, or 0 if not a segment.
    231     */
    232    int32_t segmentNumber;
    233 
    234    /**
    235     * Start offset, in the match text, of the <em>rightmost</em>
    236     * match.
    237     */
    238    int32_t matchStart;
    239 
    240    /**
    241     * Limit offset, in the match text, of the <em>rightmost</em>
    242     * match.
    243     */
    244    int32_t matchLimit;
    245 
    246 };
    247 
    248 U_NAMESPACE_END
    249 
    250 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    251 
    252 #endif