tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

search.h (22741B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
      6 **********************************************************************
      7 *   Date        Name        Description
      8 *  03/22/2000   helena      Creation.
      9 **********************************************************************
     10 */
     11 
     12 #ifndef SEARCH_H
     13 #define SEARCH_H
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if U_SHOW_CPLUSPLUS_API
     18 
     19 /**
     20 * \file 
     21 * \brief C++ API: SearchIterator object.
     22 */
     23 
     24 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     25 
     26 #include "unicode/uobject.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/chariter.h"
     29 #include "unicode/brkiter.h"
     30 #include "unicode/usearch.h"
     31 
     32 /**
     33 * @stable ICU 2.0
     34 */
     35 struct USearch;
     36 /**
     37 * @stable ICU 2.0
     38 */
     39 typedef struct USearch USearch;
     40 
     41 U_NAMESPACE_BEGIN
     42 
     43 /**
     44 *
     45 * <tt>SearchIterator</tt> is an abstract base class that provides 
     46 * methods to search for a pattern within a text string. Instances of
     47 * <tt>SearchIterator</tt> maintain a current position and scan over the 
     48 * target text, returning the indices at which the pattern is matched and 
     49 * the length of each match.
     50 * <p>
     51 * <tt>SearchIterator</tt> defines a protocol for text searching. 
     52 * Subclasses provide concrete implementations of various search algorithms. 
     53 * For example, <tt>StringSearch</tt> implements language-sensitive pattern 
     54 * matching based on the comparison rules defined in a 
     55 * <tt>RuleBasedCollator</tt> object. 
     56 * <p> 
     57 * Other options for searching include using a BreakIterator to restrict 
     58 * the points at which matches are detected.
     59 * <p>
     60 * <tt>SearchIterator</tt> provides an API that is similar to that of
     61 * other text iteration classes such as <tt>BreakIterator</tt>. Using 
     62 * this class, it is easy to scan through text looking for all occurrences of 
     63 * a given pattern. The following example uses a <tt>StringSearch</tt> 
     64 * object to find all instances of "fox" in the target string. Any other 
     65 * subclass of <tt>SearchIterator</tt> can be used in an identical 
     66 * manner.
     67 * <pre><code>
     68 * UnicodeString target("The quick brown fox jumped over the lazy fox");
     69 * UnicodeString pattern("fox");
     70 *
     71 * SearchIterator *iter  = new StringSearch(pattern, target);
     72 * UErrorCode      error = U_ZERO_ERROR;
     73 * for (int pos = iter->first(error); pos != USEARCH_DONE; 
     74 *                               pos = iter->next(error)) {
     75 *     printf("Found match at %d pos, length is %d\n", pos, iter->getMatchedLength());
     76 * }
     77 * </code></pre>
     78 *
     79 * @see StringSearch
     80 * @see RuleBasedCollator
     81 */
     82 class U_I18N_API SearchIterator : public UObject {
     83 
     84 public:
     85 
     86    // public constructors and destructors -------------------------------
     87 
     88    /** 
     89    * Copy constructor that creates a SearchIterator instance with the same 
     90    * behavior, and iterating over the same text. 
     91    * @param other the SearchIterator instance to be copied.
     92    * @stable ICU 2.0
     93    */
     94    SearchIterator(const SearchIterator &other);
     95 
     96    /**
     97     * Destructor. Cleans up the search iterator data struct.
     98     * @stable ICU 2.0
     99     */
    100    virtual ~SearchIterator();
    101 
    102    // public get and set methods ----------------------------------------
    103 
    104    /**
    105     * Sets the index to point to the given position, and clears any state 
    106     * that's affected.
    107     * <p>
    108     * This method takes the argument index and sets the position in the text 
    109     * string accordingly without checking if the index is pointing to a 
    110     * valid starting point to begin searching. 
    111     * @param position within the text to be set. If position is less
    112     *             than or greater than the text range for searching, 
    113     *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
    114     * @param status for errors if it occurs
    115     * @stable ICU 2.0
    116     */
    117    virtual void setOffset(int32_t position, UErrorCode &status) = 0;
    118 
    119    /**
    120     * Return the current index in the text being searched.
    121     * If the iteration has gone past the end of the text
    122     * (or past the beginning for a backwards search), USEARCH_DONE
    123     * is returned.
    124     * @return current index in the text being searched.
    125     * @stable ICU 2.0
    126     */
    127    virtual int32_t getOffset() const = 0;
    128 
    129    /**
    130    * Sets the text searching attributes located in the enum 
    131    * USearchAttribute with values from the enum USearchAttributeValue.
    132    * USEARCH_DEFAULT can be used for all attributes for resetting.
    133    * @param attribute text attribute (enum USearchAttribute) to be set
    134    * @param value text attribute value
    135    * @param status for errors if it occurs
    136    * @stable ICU 2.0
    137    */
    138    void setAttribute(USearchAttribute       attribute,
    139                      USearchAttributeValue  value,
    140                      UErrorCode            &status);
    141 
    142    /**    
    143    * Gets the text searching attributes
    144    * @param attribute text attribute (enum USearchAttribute) to be retrieved
    145    * @return text attribute value
    146    * @stable ICU 2.0
    147    */
    148    USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
    149    
    150    /**
    151    * Returns the index to the match in the text string that was searched.
    152    * This call returns a valid result only after a successful call to 
    153    * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
    154    * Just after construction, or after a searching method returns 
    155    * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
    156    * <p>
    157    * Use getMatchedLength to get the matched string length.
    158    * @return index of a substring within the text string that is being 
    159    *         searched.
    160    * @see #first
    161    * @see #next
    162    * @see #previous
    163    * @see #last
    164    * @stable ICU 2.0
    165    */
    166    int32_t getMatchedStart() const;
    167 
    168    /**
    169     * Returns the length of text in the string which matches the search 
    170     * pattern. This call returns a valid result only after a successful call 
    171     * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
    172     * Just after construction, or after a searching method returns 
    173     * <tt>USEARCH_DONE</tt>, this method will return 0.
    174     * @return The length of the match in the target text, or 0 if there
    175     *         is no match currently.
    176     * @see #first
    177     * @see #next
    178     * @see #previous
    179     * @see #last
    180     * @stable ICU 2.0
    181     */
    182    int32_t getMatchedLength() const;
    183 
    184    /**
    185     * Returns the text that was matched by the most recent call to 
    186     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
    187     * If the iterator is not pointing at a valid match (e.g. just after 
    188     * construction or after <tt>USEARCH_DONE</tt> has been returned), 
    189     * returns an empty string. 
    190     * @param result stores the matched string or an empty string if a match
    191     *        is not found.
    192     * @see #first
    193     * @see #next
    194     * @see #previous
    195     * @see #last
    196     * @stable ICU 2.0
    197     */
    198    void getMatchedText(UnicodeString &result) const;
    199    
    200    /**
    201     * Set the BreakIterator that will be used to restrict the points
    202     * at which matches are detected. The user is responsible for deleting 
    203     * the breakiterator.
    204     * @param breakiter A BreakIterator that will be used to restrict the 
    205     *                points at which matches are detected. If a match is 
    206     *                found, but the match's start or end index is not a 
    207     *                boundary as determined by the <tt>BreakIterator</tt>, 
    208     *                the match will be rejected and another will be searched 
    209     *                for. If this parameter is <tt>nullptr</tt>, no break
    210     *                detection is attempted.
    211     * @param status for errors if it occurs
    212     * @see BreakIterator
    213     * @stable ICU 2.0
    214     */
    215    void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
    216    
    217    /**
    218     * Returns the BreakIterator that is used to restrict the points at 
    219     * which matches are detected.  This will be the same object that was 
    220     * passed to the constructor or to <tt>setBreakIterator</tt>.
    221     * Note that <tt>nullptr</tt> is a legal value; it means that break
    222     * detection should not be attempted.
    223     * @return BreakIterator used to restrict matchings.
    224     * @see #setBreakIterator
    225     * @stable ICU 2.0
    226     */
    227    const BreakIterator* getBreakIterator() const;
    228 
    229    /**
    230     * Set the string text to be searched. Text iteration will hence begin at 
    231     * the start of the text string. This method is useful if you want to 
    232     * re-use an iterator to search for the same pattern within a different 
    233     * body of text. The user is responsible for deleting the text.
    234     * @param text string to be searched.
    235     * @param status for errors. If the text length is 0, 
    236     *        an U_ILLEGAL_ARGUMENT_ERROR is returned.
    237     * @stable ICU 2.0
    238     */
    239    virtual void setText(const UnicodeString &text, UErrorCode &status);    
    240 
    241    /**
    242     * Set the string text to be searched. Text iteration will hence begin at 
    243     * the start of the text string. This method is useful if you want to 
    244     * re-use an iterator to search for the same pattern within a different 
    245     * body of text.
    246     * <p>
    247     * Note: No parsing of the text within the <tt>CharacterIterator</tt> 
    248     * will be done during searching for this version. The block of text 
    249     * in <tt>CharacterIterator</tt> will be used as it is.
    250     * The user is responsible for deleting the text.
    251     * @param text string iterator to be searched.
    252     * @param status for errors if any. If the text length is 0 then an 
    253     *        U_ILLEGAL_ARGUMENT_ERROR is returned.
    254     * @stable ICU 2.0
    255     */
    256    virtual void setText(CharacterIterator &text, UErrorCode &status);
    257    
    258    /**
    259     * Return the string text to be searched.
    260     * @return text string to be searched.
    261     * @stable ICU 2.0
    262     */
    263    const UnicodeString& getText() const;
    264 
    265    // operator overloading ----------------------------------------------
    266 
    267    /**
    268     * Equality operator. 
    269     * @param that SearchIterator instance to be compared.
    270     * @return true if both SearchIterators are of the same class, have the 
    271     *         same behavior, iterate over the same text and have the same
    272     *         attributes. false otherwise.
    273     * @stable ICU 2.0
    274     */
    275    virtual bool operator==(const SearchIterator &that) const;
    276 
    277    /**
    278     * Not-equal operator. 
    279     * @param that SearchIterator instance to be compared.
    280     * @return false if operator== returns true, and vice versa.
    281     * @stable ICU 2.0
    282     */
    283    bool operator!=(const SearchIterator &that) const;
    284 
    285    // public methods ----------------------------------------------------
    286 
    287    /**
    288     * Returns a copy of SearchIterator with the same behavior, and 
    289     * iterating over the same text, as this one. Note that all data will be
    290     * replicated, except for the text string to be searched.
    291     * @return cloned object
    292     * @stable ICU 2.0
    293     */
    294    virtual SearchIterator* safeClone() const = 0;
    295 
    296    /**
    297     * Returns the first index at which the string text matches the search 
    298     * pattern. The iterator is adjusted so that its current index (as 
    299     * returned by <tt>getOffset</tt>) is the match position if one 
    300     * was found.
    301     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    302     * the iterator will be adjusted to the index USEARCH_DONE.
    303     * @param  status for errors if it occurs
    304     * @return The character index of the first match, or 
    305     *         <tt>USEARCH_DONE</tt> if there are no matches.
    306     * @see #getOffset
    307     * @stable ICU 2.0
    308     */
    309    int32_t first(UErrorCode &status);
    310 
    311    /**
    312     * Returns the first index equal or greater than <tt>position</tt> at which the 
    313     * string text matches the search pattern. The iterator is adjusted so 
    314     * that its current index (as returned by <tt>getOffset</tt>) is the 
    315     * match position if one was found.
    316     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
    317     * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
    318     * @param  position where search is to start from. If position is less
    319     *             than or greater than the text range for searching, 
    320     *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
    321     * @param  status for errors if it occurs
    322     * @return The character index of the first match following 
    323     *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no 
    324     *         matches.
    325     * @see #getOffset
    326     * @stable ICU 2.0
    327     */
    328    int32_t following(int32_t position, UErrorCode &status);
    329    
    330    /**
    331     * Returns the last index in the target text at which it matches the 
    332     * search pattern. The iterator is adjusted so that its current index 
    333     * (as returned by <tt>getOffset</tt>) is the match position if one was 
    334     * found.
    335     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    336     * the iterator will be adjusted to the index USEARCH_DONE.
    337     * @param  status for errors if it occurs
    338     * @return The index of the last match, or <tt>USEARCH_DONE</tt> if 
    339     *         there are no matches.
    340     * @see #getOffset
    341     * @stable ICU 2.0
    342     */
    343    int32_t last(UErrorCode &status);
    344 
    345    /**
    346     * Returns the first index less than <tt>position</tt> at which the string 
    347     * text matches the search pattern. The iterator is adjusted so that its 
    348     * current index (as returned by <tt>getOffset</tt>) is the match 
    349     * position if one was found. If a match is not found, 
    350     * <tt>USEARCH_DONE</tt> will be returned and the iterator will be 
    351     * adjusted to the index USEARCH_DONE.
    352     * <p>
    353     * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
    354     * result match is always less than <tt>position</tt>.
    355     * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
    356     * <tt>position</tt>.
    357     *
    358     * @param  position where search is to start from. If position is less
    359     *             than or greater than the text range for searching, 
    360     *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
    361     * @param  status for errors if it occurs
    362     * @return The character index of the first match preceding 
    363     *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are 
    364     *         no matches.
    365     * @see #getOffset
    366     * @stable ICU 2.0
    367     */
    368    int32_t preceding(int32_t position, UErrorCode &status);
    369 
    370    /**
    371     * Returns the index of the next point at which the text matches the
    372     * search pattern, starting from the current position.
    373     * The iterator is adjusted so that its current index (as returned by 
    374     * <tt>getOffset</tt>) is the match position if one was found.
    375     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    376     * the iterator will be adjusted to a position after the end of the text 
    377     * string.
    378     * @param  status for errors if it occurs
    379     * @return The index of the next match after the current position,
    380     *          or <tt>USEARCH_DONE</tt> if there are no more matches.
    381     * @see #getOffset
    382     * @stable ICU 2.0
    383     */
    384     int32_t next(UErrorCode &status);
    385 
    386    /**
    387     * Returns the index of the previous point at which the string text 
    388     * matches the search pattern, starting at the current position.
    389     * The iterator is adjusted so that its current index (as returned by 
    390     * <tt>getOffset</tt>) is the match position if one was found.
    391     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
    392     * the iterator will be adjusted to the index USEARCH_DONE.
    393     * @param  status for errors if it occurs
    394     * @return The index of the previous match before the current position,
    395     *          or <tt>USEARCH_DONE</tt> if there are no more matches.
    396     * @see #getOffset
    397     * @stable ICU 2.0
    398     */
    399    int32_t previous(UErrorCode &status);
    400 
    401    /** 
    402    * Resets the iteration.
    403    * Search will begin at the start of the text string if a forward 
    404    * iteration is initiated before a backwards iteration. Otherwise if a 
    405    * backwards iteration is initiated before a forwards iteration, the 
    406    * search will begin at the end of the text string.    
    407    * @stable ICU 2.0
    408    */
    409    virtual void reset();
    410 
    411 protected:
    412    // protected data members ---------------------------------------------
    413 
    414    /**
    415    * C search data struct
    416    * @stable ICU 2.0
    417    */
    418    USearch *m_search_;
    419 
    420    /**
    421    * Break iterator.
    422    * Currently the C++ breakiterator does not have getRules etc to reproduce
    423    * another in C. Hence we keep the original around and do the verification
    424    * at the end of the match. The user is responsible for deleting this
    425    * break iterator.
    426    * @stable ICU 2.0
    427    */
    428    BreakIterator *m_breakiterator_;
    429    
    430    /**
    431    * Unicode string version of the search text
    432    * @stable ICU 2.0
    433    */
    434    UnicodeString  m_text_;
    435 
    436    // protected constructors and destructors -----------------------------
    437 
    438    /**
    439    * Default constructor.
    440    * Initializes data to the default values.
    441    * @stable ICU 2.0
    442    */
    443    SearchIterator();
    444 
    445    /**
    446     * Constructor for use by subclasses.
    447     * @param text The target text to be searched.
    448     * @param breakiter A {@link BreakIterator} that is used to restrict the 
    449     *                points at which matches are detected. If 
    450     *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 
    451     *                match, but the match's start or end index is not a 
    452     *                boundary as determined by the <tt>BreakIterator</tt>, 
    453     *                the match is rejected and <tt>handleNext</tt> or 
    454     *                <tt>handlePrev</tt> is called again. If this parameter 
    455     *                is <tt>nullptr</tt>, no break detection is attempted.
    456     * @see #handleNext
    457     * @see #handlePrev
    458     * @stable ICU 2.0
    459     */
    460    SearchIterator(const UnicodeString &text, 
    461                         BreakIterator *breakiter = nullptr);
    462 
    463    /**
    464     * Constructor for use by subclasses.
    465     * <p>
    466     * Note: No parsing of the text within the <tt>CharacterIterator</tt> 
    467     * will be done during searching for this version. The block of text 
    468     * in <tt>CharacterIterator</tt> will be used as it is.
    469     * @param text The target text to be searched.
    470     * @param breakiter A {@link BreakIterator} that is used to restrict the 
    471     *                points at which matches are detected. If 
    472     *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 
    473     *                match, but the match's start or end index is not a 
    474     *                boundary as determined by the <tt>BreakIterator</tt>, 
    475     *                the match is rejected and <tt>handleNext</tt> or 
    476     *                <tt>handlePrev</tt> is called again. If this parameter 
    477     *                is <tt>nullptr</tt>, no break detection is attempted.
    478     * @see #handleNext
    479     * @see #handlePrev
    480     * @stable ICU 2.0
    481     */
    482    SearchIterator(CharacterIterator &text, BreakIterator *breakiter = nullptr);
    483 
    484    // protected methods --------------------------------------------------
    485 
    486    /**
    487     * Assignment operator. Sets this iterator to have the same behavior,
    488     * and iterate over the same text, as the one passed in.
    489     * @param that instance to be copied.
    490     * @stable ICU 2.0
    491     */
    492    SearchIterator & operator=(const SearchIterator &that);
    493 
    494    /**
    495     * Abstract method which subclasses override to provide the mechanism
    496     * for finding the next match in the target text. This allows different
    497     * subclasses to provide different search algorithms.
    498     * <p>
    499     * If a match is found, the implementation should return the index at
    500     * which the match starts and should call 
    501     * <tt>setMatchLength</tt> with the number of characters 
    502     * in the target text that make up the match. If no match is found, the 
    503     * method should return USEARCH_DONE.
    504     * <p>
    505     * @param position The index in the target text at which the search 
    506     *                 should start.
    507     * @param status for error codes if it occurs.
    508     * @return index at which the match starts, else if match is not found 
    509     *         USEARCH_DONE is returned
    510     * @see #setMatchLength
    511     * @stable ICU 2.0
    512     */
    513    virtual int32_t handleNext(int32_t position, UErrorCode &status) 
    514                                                                         = 0;
    515 
    516    /**
    517     * Abstract method which subclasses override to provide the mechanism for
    518     * finding the previous match in the target text. This allows different
    519     * subclasses to provide different search algorithms.
    520     * <p>
    521     * If a match is found, the implementation should return the index at
    522     * which the match starts and should call 
    523     * <tt>setMatchLength</tt> with the number of characters 
    524     * in the target text that make up the match. If no match is found, the 
    525     * method should return USEARCH_DONE.
    526     * <p>
    527     * @param position The index in the target text at which the search 
    528     *                 should start.
    529     * @param status for error codes if it occurs.
    530     * @return index at which the match starts, else if match is not found 
    531     *         USEARCH_DONE is returned
    532     * @see #setMatchLength
    533     * @stable ICU 2.0
    534     */
    535     virtual int32_t handlePrev(int32_t position, UErrorCode &status) 
    536                                                                         = 0;
    537 
    538    /**
    539     * Sets the length of the currently matched string in the text string to
    540     * be searched.
    541     * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
    542     * methods should call this when they find a match in the target text.
    543     * @param length length of the matched text.
    544     * @see #handleNext
    545     * @see #handlePrev
    546     * @stable ICU 2.0
    547     */
    548    virtual void setMatchLength(int32_t length);
    549 
    550    /**
    551     * Sets the offset of the currently matched string in the text string to
    552     * be searched.
    553     * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
    554     * methods should call this when they find a match in the target text.
    555     * @param position start offset of the matched text.
    556     * @see #handleNext
    557     * @see #handlePrev
    558     * @stable ICU 2.0
    559     */
    560    virtual void setMatchStart(int32_t position);
    561 
    562    /**
    563    * Sets the match state to "not found".
    564    * @stable ICU 2.0
    565    */
    566    void setMatchNotFound();
    567 };
    568 
    569 inline bool SearchIterator::operator!=(const SearchIterator &that) const
    570 {
    571   return !(this->operator==(that)); // exact negation of the virtual operator==
    572 }
    573 U_NAMESPACE_END
    574 
    575 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */
    576 
    577 #endif /* U_SHOW_CPLUSPLUS_API */
    578 
    579 #endif