tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

usearch.h (40153B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2001-2011,2014 IBM and others. All rights reserved.
      6 **********************************************************************
      7 *   Date        Name        Description
      8 *  06/28/2001   synwee      Creation.
      9 **********************************************************************
     10 */
     11 #ifndef USEARCH_H
     12 #define USEARCH_H
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/ucol.h"
     19 #include "unicode/ucoleitr.h"
     20 #include "unicode/ubrk.h"
     21 
     22 #if U_SHOW_CPLUSPLUS_API
     23 #include "unicode/localpointer.h"
     24 #endif   // U_SHOW_CPLUSPLUS_API
     25 
     26 /**
     27 * \file
     28 * \brief C API: StringSearch
     29 *
     30 * C APIs for an engine that provides language-sensitive text searching based 
     31 * on the comparison rules defined in a <code>UCollator</code> data struct,
     32 * see <code>ucol.h</code>. This ensures that language eccentricity can be 
     33 * handled, e.g. for the German collator, characters &szlig; and SS will be matched 
     34 * if case is chosen to be ignored. 
     35 * See the <a href="https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/collation/ICU_collation_design.htm">
     36 * "ICU Collation Design Document"</a> for more information.
     37 * <p> 
     38 * As of ICU4C 4.0 / ICU4J 53, the implementation uses a linear search. In previous versions,
     39 * a modified form of the Boyer-Moore searching algorithm was used. For more information
     40 * on the modified Boyer-Moore algorithm see
     41 * <a href="http://icu-project.org/docs/papers/efficient_text_searching_in_java.html">
     42 * "Efficient Text Searching in Java"</a>, published in <i>Java Report</i> 
     43 * in February, 1999.
     44 * <p>
     45 * There are 2 match options for selection:<br>
     46 * Let S' be the sub-string of a text string S between the offsets start and 
     47 * end <start, end>.
     48 * <br>
     49 * A pattern string P matches a text string S at the offsets <start, end> 
     50 * if
     51 * <pre> 
     52 * option 1. Some canonical equivalent of P matches some canonical equivalent 
     53 *           of S'
     54 * option 2. P matches S' and if P starts or ends with a combining mark, 
     55 *           there exists no non-ignorable combining mark before or after S' 
     56 *           in S respectively. 
     57 * </pre>
     58 * Option 2. will be the default.
     59 * <p>
     60 * This search has APIs similar to that of other text iteration mechanisms 
     61 * such as the break iterators in <code>ubrk.h</code>. Using these 
     62 * APIs, it is easy to scan through text looking for all occurrences of 
     63 * a given pattern. This search iterator allows changing of direction by 
     64 * calling a <code>reset</code> followed by a <code>next</code> or <code>previous</code>. 
     65 * Though a direction change can occur without calling <code>reset</code> first,  
     66 * this operation comes with some speed penalty.
     67 * Generally, match results in the forward direction will match the result 
     68 * matches in the backwards direction in the reverse order.
     69 * <p>
     70 * <code>usearch.h</code> provides APIs to specify the starting position 
     71 * within the text string to be searched, e.g. <code>usearch_setOffset</code>,
     72 * <code>usearch_preceding</code> and <code>usearch_following</code>. Since the 
     73 * starting position will be set as it is specified, please take note that 
     74 * there are some dangerous positions which the search may render incorrect 
     75 * results:
     76 * <ul>
     77 * <li> The midst of a substring that requires normalization.
     78 * <li> If the following match is to be found, the position should not be the
     79 *      second character which requires to be swapped with the preceding 
     80 *      character. Vice versa, if the preceding match is to be found, the
     81 *      position to search from should not be the first character which 
     82 *      requires to be swapped with the next character. E.g. certain Thai and
     83 *      Lao characters require swapping.
     84 * <li> If a following pattern match is to be found, any position within a 
     85 *      contracting sequence except the first will fail. Vice versa if a 
     86 *      preceding pattern match is to be found, an invalid starting point 
     87 *      would be any character within a contracting sequence except the last.
     88 * </ul>
     89 * <p>
     90 * A breakiterator can be used if only matches at logical breaks are desired.
     91 * Using a breakiterator will only give you results that exactly match the
     92 * boundaries given by the breakiterator. For instance the pattern "e" will
     93 * not be found in the string "\u00e9" if a character break iterator is used.
     94 * <p>
     95 * Options are provided to handle overlapping matches. 
     96 * E.g. In English, overlapping matches produce the results 0 and 2 
     97 * for the pattern "abab" in the text "ababab", whereas mutually 
     98 * exclusive matches only produce the result 0.
     99 * <p>
    100 * Options are also provided to implement "asymmetric search" as described in
    101 * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">
    102 * UTS #10 Unicode Collation Algorithm</a>, specifically the USearchAttribute
    103 * USEARCH_ELEMENT_COMPARISON and its values.
    104 * <p>
    105 * Though collator attributes will be taken into consideration while 
    106 * performing matches, there are no APIs here for setting and getting the 
    107 * attributes. These attributes can be set by getting the collator
    108 * from <code>usearch_getCollator</code> and using the APIs in <code>ucol.h</code>.
    109 * Lastly to update String Search to the new collator attributes, 
    110 * usearch_reset() has to be called.
    111 * <p> 
    112 * Restriction: <br>
    113 * Currently there are no composite characters that consist of a
    114 * character with combining class > 0 before a character with combining 
    115 * class == 0. However, if such a character exists in the future, the 
    116 * search mechanism does not guarantee the results for option 1.
    117 * 
    118 * <p>
    119 * Example of use:<br>
    120 * <pre><code>
    121 * char *tgtstr = "The quick brown fox jumped over the lazy fox";
    122 * char *patstr = "fox";
    123 * UChar target[64];
    124 * UChar pattern[16];
    125 * UErrorCode status = U_ZERO_ERROR;
    126 * u_uastrcpy(target, tgtstr);
    127 * u_uastrcpy(pattern, patstr);
    128 *
    129 * UStringSearch *search = usearch_open(pattern, -1, target, -1, "en_US", 
    130 *                                  NULL, &status);
    131 * if (U_SUCCESS(status)) {
    132 *     for (int pos = usearch_first(search, &status); 
    133 *          pos != USEARCH_DONE; 
    134 *          pos = usearch_next(search, &status))
    135 *     {
    136 *         printf("Found match at %d pos, length is %d\n", pos, 
    137 *                                        usearch_getMatchedLength(search));
    138 *     }
    139 * }
    140 *
    141 * usearch_close(search);
    142 * </code></pre>
    143 * @stable ICU 2.4
    144 */
    145 
     146 /**
     147 * USEARCH_DONE is returned by usearch_previous() and usearch_next() after all valid
     148 * matches have been returned, and by usearch_first() and usearch_last() if there are no matches at all.
     149 * @stable ICU 2.4
     150 */
     151 #define USEARCH_DONE -1
    152 
    153 /**
    154 * Data structure for searching
    155 * @stable ICU 2.4
    156 */
    157 struct UStringSearch;
    158 /**
    159 * Data structure for searching
    160 * @stable ICU 2.4
    161 */
    162 typedef struct UStringSearch UStringSearch;
    163 
    164 /**
    165 * @stable ICU 2.4
    166 */
    167 typedef enum {
    168    /**
    169     * Option for overlapping matches
    170     * @stable ICU 2.4
    171     */
    172    USEARCH_OVERLAP = 0,
    173 #ifndef U_HIDE_DEPRECATED_API
    174    /** 
    175     * Option for canonical matches; option 1 in header documentation.
    176     * The default value will be USEARCH_OFF.
    177     * Note: Setting this option to USEARCH_ON currently has no effect on
    178     * search behavior, and this option is deprecated. Instead, to control
    179     * canonical match behavior, you must set UCOL_NORMALIZATION_MODE
    180     * appropriately (to UCOL_OFF or UCOL_ON) in the UCollator used by
    181     * the UStringSearch object.
    182     * @see usearch_openFromCollator 
    183     * @see usearch_getCollator
    184     * @see usearch_setCollator
    185     * @see ucol_getAttribute
    186     * @deprecated ICU 53
    187     */
    188    USEARCH_CANONICAL_MATCH = 1,
    189 #endif  /* U_HIDE_DEPRECATED_API */
    190    /** 
    191     * Option to control how collation elements are compared.
    192     * The default value will be USEARCH_STANDARD_ELEMENT_COMPARISON.
    193     * @stable ICU 4.4
    194     */
    195    USEARCH_ELEMENT_COMPARISON = 2,
    196 
    197 #ifndef U_HIDE_DEPRECATED_API
    198    /**
    199     * One more than the highest normal USearchAttribute value.
    200     * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
    201     */
    202    USEARCH_ATTRIBUTE_COUNT = 3
    203 #endif  /* U_HIDE_DEPRECATED_API */
    204 } USearchAttribute;
    205 
    206 /**
    207 * @stable ICU 2.4
    208 */
    209 typedef enum {
    210    /** 
    211     * Default value for any USearchAttribute
    212     * @stable ICU 2.4
    213     */
    214    USEARCH_DEFAULT = -1,
    215    /**
    216     * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH
    217     * @stable ICU 2.4
    218     */
    219    USEARCH_OFF, 
    220    /**
    221     * Value for USEARCH_OVERLAP and USEARCH_CANONICAL_MATCH
    222     * @stable ICU 2.4
    223     */
    224    USEARCH_ON,
    225    /** 
    226     * Value (default) for USEARCH_ELEMENT_COMPARISON;
    227     * standard collation element comparison at the specified collator
    228     * strength.
    229     * @stable ICU 4.4
    230     */
    231    USEARCH_STANDARD_ELEMENT_COMPARISON,
    232    /** 
    233     * Value for USEARCH_ELEMENT_COMPARISON;
    234     * collation element comparison is modified to effectively provide
    235     * behavior between the specified strength and strength - 1. Collation
    236     * elements in the pattern that have the base weight for the specified
    237     * strength are treated as "wildcards" that match an element with any
    238     * other weight at that collation level in the searched text. For
    239     * example, with a secondary-strength English collator, a plain 'e' in
    240     * the pattern will match a plain e or an e with any diacritic in the
    241     * searched text, but an e with diacritic in the pattern will only
    242     * match an e with the same diacritic in the searched text.
    243     *
    244     * This supports "asymmetric search" as described in
    245     * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">
    246     * UTS #10 Unicode Collation Algorithm</a>.
    247     *
    248     * @stable ICU 4.4
    249     */
    250    USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD,
    251    /** 
    252     * Value for USEARCH_ELEMENT_COMPARISON;
    253     * collation element comparison is modified to effectively provide
    254     * behavior between the specified strength and strength - 1. Collation
    255     * elements in either the pattern or the searched text that have the
    256     * base weight for the specified strength are treated as "wildcards"
    257     * that match an element with any other weight at that collation level.
    258     * For example, with a secondary-strength English collator, a plain 'e'
    259     * in the pattern will match a plain e or an e with any diacritic in the
    260     * searched text, but an e with diacritic in the pattern will only
    261     * match an e with the same diacritic or a plain e in the searched text.
    262     *
    263     * This option is similar to "asymmetric search" as described in
    264     * [UTS #10 Unicode Collation Algorithm](http://www.unicode.org/reports/tr10/#Asymmetric_Search),
    265     * but also allows unmarked characters in the searched text to match
    266     * marked or unmarked versions of that character in the pattern.
    267     *
    268     * @stable ICU 4.4
    269     */
    270    USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD,
    271 
    272 #ifndef U_HIDE_DEPRECATED_API
    273    /**
    274     * One more than the highest normal USearchAttributeValue value.
    275     * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
    276     */
    277    USEARCH_ATTRIBUTE_VALUE_COUNT
    278 #endif  /* U_HIDE_DEPRECATED_API */
    279 } USearchAttributeValue;
    280 
    281 /* open and close ------------------------------------------------------ */
    282 
    283 /**
    284 * Creates a String Search iterator data struct using the argument locale language
    285 * rule set. A collator will be created in the process, which will be owned by
    286 * this String Search and will be deleted in <code>usearch_close</code>.
    287 *
    288 * The UStringSearch retains a pointer to both the pattern and text strings.
    289 * The caller must not modify or delete them while using the UStringSearch.
    290 *
    291 * @param pattern for matching
    292 * @param patternlength length of the pattern, -1 for null-termination
    293 * @param text text string
    294 * @param textlength length of the text string, -1 for null-termination
    295 * @param locale name of locale for the rules to be used
    296 * @param breakiter A BreakIterator that will be used to restrict the points
    297 *                  at which matches are detected. If a match is found, but 
    298 *                  the match's start or end index is not a boundary as 
    299 *                  determined by the <code>BreakIterator</code>, the match will 
    300 *                  be rejected and another will be searched for. 
    301 *                  If this parameter is <code>NULL</code>, no break detection is 
    302 *                  attempted.
    303 * @param status for errors if it occurs. If pattern or text is NULL, or if
    304 *               patternlength or textlength is 0 then an 
    305 *               U_ILLEGAL_ARGUMENT_ERROR is returned.
    306 * @return search iterator data structure, or NULL if there is an error.
    307 * @stable ICU 2.4
    308 */
    309 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar    *pattern,
    310                                              int32_t         patternlength,
    311                                        const UChar          *text,
    312                                              int32_t         textlength,
    313                                        const char           *locale,
    314                                              UBreakIterator *breakiter,
    315                                              UErrorCode     *status);
    316 
    317 /**
    318 * Creates a String Search iterator data struct using the argument collator language
    319 * rule set. Note, user retains the ownership of this collator, thus the
    320 * responsibility of deletion lies with the user.
     321 *
    322 * NOTE: String Search cannot be instantiated from a collator that has
    323 * collate digits as numbers (CODAN) turned on (UCOL_NUMERIC_COLLATION).
    324 *
    325 * The UStringSearch retains a pointer to both the pattern and text strings.
    326 * The caller must not modify or delete them while using the UStringSearch.
    327 *
    328 * @param pattern for matching
    329 * @param patternlength length of the pattern, -1 for null-termination
    330 * @param text text string
    331 * @param textlength length of the text string, -1 for null-termination
    332 * @param collator used for the language rules
    333 * @param breakiter A BreakIterator that will be used to restrict the points
    334 *                  at which matches are detected. If a match is found, but
    335 *                  the match's start or end index is not a boundary as
    336 *                  determined by the <code>BreakIterator</code>, the match will
    337 *                  be rejected and another will be searched for.
    338 *                  If this parameter is <code>NULL</code>, no break detection is
    339 *                  attempted.
    340 * @param status for errors if it occurs. If collator, pattern or text is NULL,
    341 *               or if patternlength or textlength is 0 then an
    342 *               U_ILLEGAL_ARGUMENT_ERROR is returned.
    343 * @return search iterator data structure, or NULL if there is an error.
    344 * @stable ICU 2.4
    345 */
    346 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
    347                                         const UChar          *pattern,
    348                                               int32_t         patternlength,
    349                                         const UChar          *text,
    350                                               int32_t         textlength,
    351                                         const UCollator      *collator,
    352                                               UBreakIterator *breakiter,
    353                                               UErrorCode     *status);
    354 
    355 /**
    356 * Destroys and cleans up the String Search iterator data struct.
    357 * If a collator was created in <code>usearch_open</code>, then it will be destroyed here.
    358 * @param searchiter The UStringSearch to clean up
    359 * @stable ICU 2.4
    360 */
    361 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *searchiter);
    362 
     363 #if U_SHOW_CPLUSPLUS_API
     364 
     365 U_NAMESPACE_BEGIN
     366 
     367 /**
     368 * \class LocalUStringSearchPointer
     369 * "Smart pointer" class, closes a UStringSearch via usearch_close().
     370 * For most methods see the LocalPointerBase base class.
     371 *
     372 * @see LocalPointerBase
     373 * @see LocalPointer
     374 * @stable ICU 4.4
     375 */
     376 U_DEFINE_LOCAL_OPEN_POINTER(LocalUStringSearchPointer, UStringSearch, usearch_close);
     377 
     378 U_NAMESPACE_END
     379 
     380 #endif   // U_SHOW_CPLUSPLUS_API
    381 
    382 /* get and set methods -------------------------------------------------- */
    383 
     384 /**
     385 * Sets the current position in the text string which the next search will 
     386 * start from. Clears previous states. 
     387 * This method takes the argument position and sets the position in the text 
     388 * string accordingly without checking if the position is pointing to a 
     389 * valid starting point to begin searching. 
     390 * Search positions that may render incorrect results are highlighted in the
     391 * header comments.
     392 * @param strsrch search iterator data struct
     393 * @param position position to start next search from. If position lies
     394 *          outside the text range for searching, 
     395 *          U_INDEX_OUTOFBOUNDS_ERROR will be set in status.
     396 * @param status error status if any.
     397 * @stable ICU 2.4
     398 */
     399 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
     400                                        int32_t        position,
     401                                        UErrorCode    *status);
    402 
    403 /**
    404 * Return the current index in the string text being searched.
    405 * If the iteration has gone past the end of the text (or past the beginning 
    406 * for a backwards search), <code>USEARCH_DONE</code> is returned.
    407 * @param strsrch search iterator data struct
    408 * @see #USEARCH_DONE
    409 * @stable ICU 2.4
    410 */
    411 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch);
    412    
    413 /**
    414 * Sets the text searching attributes located in the enum USearchAttribute
    415 * with values from the enum USearchAttributeValue.
    416 * <code>USEARCH_DEFAULT</code> can be used for all attributes for resetting.
    417 * @param strsrch search iterator data struct
    418 * @param attribute text attribute to be set
    419 * @param value text attribute value
    420 * @param status for errors if it occurs
    421 * @see #usearch_getAttribute
    422 * @stable ICU 2.4
    423 */
    424 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch         *strsrch,
    425                                           USearchAttribute       attribute,
    426                                           USearchAttributeValue  value,
    427                                           UErrorCode            *status);
    428 
     429 /**    
     430 * Gets the current value of a text searching attribute.
     431 * @param strsrch search iterator data struct
     432 * @param attribute text attribute to be retrieved
     433 * @return value of the requested text attribute
     434 * @see #usearch_setAttribute
     435 * @stable ICU 2.4
     436 */
     437 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
     438                                         const UStringSearch    *strsrch,
     439                                               USearchAttribute  attribute);
    440 
    441 /**
    442 * Returns the index to the match in the text string that was searched.
    443 * This call returns a valid result only after a successful call to 
    444 * <code>usearch_first</code>, <code>usearch_next</code>, <code>usearch_previous</code>, 
    445 * or <code>usearch_last</code>.
    446 * Just after construction, or after a searching method returns 
    447 * <code>USEARCH_DONE</code>, this method will return <code>USEARCH_DONE</code>.
    448 * <p>
    449 * Use <code>usearch_getMatchedLength</code> to get the matched string length.
    450 * @param strsrch search iterator data struct
    451 * @return index to a substring within the text string that is being 
    452 *         searched.
    453 * @see #usearch_first
    454 * @see #usearch_next
    455 * @see #usearch_previous
    456 * @see #usearch_last
    457 * @see #USEARCH_DONE
    458 * @stable ICU 2.4
    459 */
    460 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
    461                                               const UStringSearch *strsrch);
    462    
    463 /**
    464 * Returns the length of text in the string which matches the search pattern. 
    465 * This call returns a valid result only after a successful call to 
    466 * <code>usearch_first</code>, <code>usearch_next</code>, <code>usearch_previous</code>, 
    467 * or <code>usearch_last</code>.
    468 * Just after construction, or after a searching method returns 
    469 * <code>USEARCH_DONE</code>, this method will return 0.
    470 * @param strsrch search iterator data struct
    471 * @return The length of the match in the string text, or 0 if there is no 
    472 *         match currently.
    473 * @see #usearch_first
    474 * @see #usearch_next
    475 * @see #usearch_previous
    476 * @see #usearch_last
    477 * @see #USEARCH_DONE
    478 * @stable ICU 2.4
    479 */
    480 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
    481                                               const UStringSearch *strsrch);
    482 
     483 /**
     484 * Returns the text that was matched by the most recent call to 
     485 * <code>usearch_first</code>, <code>usearch_next</code>, <code>usearch_previous</code>, 
     486 * or <code>usearch_last</code>.
     487 * If the iterator is not pointing at a valid match (e.g. just after 
     488 * construction or after <code>USEARCH_DONE</code> has been returned), returns
     489 * an empty string. If result is not large enough to store the matched text,
     490 * result will be filled with the partial text and a U_BUFFER_OVERFLOW_ERROR 
     491 * will be set in status. result will be null-terminated whenever 
     492 * possible. If the buffer fits the matched text exactly, null-termination 
     493 * is not possible and U_STRING_NOT_TERMINATED_WARNING is set in status.
     494 * Pre-flighting can be done either with resultCapacity = 0 or with the API 
     495 * <code>usearch_getMatchedLength</code>.
     496 * @param strsrch search iterator data struct
     497 * @param result UChar buffer to store the matched string
     498 * @param resultCapacity length of the result buffer
     499 * @param status error returned if result is not large enough
     500 * @return exact length of the matched text, not counting the null-termination
     501 * @see #usearch_first
     502 * @see #usearch_next
     503 * @see #usearch_previous
     504 * @see #usearch_last
     505 * @see #USEARCH_DONE
     506 * @stable ICU 2.4
     507 */
     508 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 
     509                                            UChar         *result, 
     510                                            int32_t        resultCapacity, 
     511                                            UErrorCode    *status);
    512 
    513 #if !UCONFIG_NO_BREAK_ITERATION
    514 
    515 /**
    516 * Set the BreakIterator that will be used to restrict the points at which 
    517 * matches are detected.
    518 * @param strsrch search iterator data struct
    519 * @param breakiter A BreakIterator that will be used to restrict the points
    520 *                  at which matches are detected. If a match is found, but 
    521 *                  the match's start or end index is not a boundary as 
    522 *                  determined by the <code>BreakIterator</code>, the match will 
    523 *                  be rejected and another will be searched for. 
    524 *                  If this parameter is <code>NULL</code>, no break detection is 
    525 *                  attempted.
    526 * @param status for errors if it occurs
    527 * @see #usearch_getBreakIterator
    528 * @stable ICU 2.4
    529 */
    530 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch, 
    531                                               UBreakIterator *breakiter,
    532                                               UErrorCode     *status);
    533 
    534 /**
    535 * Returns the BreakIterator that is used to restrict the points at which 
    536 * matches are detected. This will be the same object that was passed to the 
    537 * constructor or to <code>usearch_setBreakIterator</code>. Note that 
    538 * <code>NULL</code> 
    539 * is a legal value; it means that break detection should not be attempted.
    540 * @param strsrch search iterator data struct
    541 * @return break iterator used
    542 * @see #usearch_setBreakIterator
    543 * @stable ICU 2.4
    544 */
    545 U_CAPI const UBreakIterator * U_EXPORT2 usearch_getBreakIterator(
    546                                              const UStringSearch *strsrch);
    547    
    548 #endif
    549 
    550 /**
    551 * Set the string text to be searched. Text iteration will hence begin at the 
    552 * start of the text string. This method is useful if you want to re-use an 
    553 * iterator to search for the same pattern within a different body of text.
    554 *
    555 * The UStringSearch retains a pointer to the text string. The caller must not
    556 * modify or delete the string while using the UStringSearch.
    557 *
    558 * @param strsrch search iterator data struct
    559 * @param text new string to look for match
    560 * @param textlength length of the new string, -1 for null-termination
    561 * @param status for errors if it occurs. If text is NULL, or textlength is 0 
    562 *               then an U_ILLEGAL_ARGUMENT_ERROR is returned with no change
    563 *               done to strsrch.
    564 * @see #usearch_getText
    565 * @stable ICU 2.4
    566 */
    567 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch, 
    568                                      const UChar         *text,
    569                                            int32_t        textlength,
    570                                            UErrorCode    *status);
    571 
    572 /**
    573 * Return the string text to be searched.
    574 * @param strsrch search iterator data struct
    575 * @param length returned string text length
    576 * @return string text 
    577 * @see #usearch_setText
    578 * @stable ICU 2.4
    579 */
    580 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 
    581                                               int32_t       *length);
    582 
    583 /**
    584 * Gets the collator used for the language rules. 
    585 * <p>
    586 * Deleting the returned <code>UCollator</code> before calling 
    587 * <code>usearch_close</code> would cause the string search to fail.
    588 * <code>usearch_close</code> will delete the collator if this search owns it.
    589 * @param strsrch search iterator data struct
    590 * @return collator
    591 * @stable ICU 2.4
    592 */
    593 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(
    594                                               const UStringSearch *strsrch);
    595 
    596 /**
    597 * Sets the collator used for the language rules. User retains the ownership 
    598 * of this collator, thus the responsibility of deletion lies with the user.
    599 * This method causes internal data such as the pattern collation elements
    600 * and shift tables to be recalculated, but the iterator's position is unchanged.
    601 * @param strsrch search iterator data struct
    602 * @param collator to be used
    603 * @param status for errors if it occurs
    604 * @stable ICU 2.4
    605 */
    606 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch, 
    607                                          const UCollator     *collator,
    608                                                UErrorCode    *status);
    609 
     610 /**
     611 * Sets the pattern used for matching.
     612 * Internal data like the pattern collation elements will be recalculated, but the 
     613 * iterator's position is unchanged.
     614 *
     615 * The UStringSearch retains a pointer to the pattern string. The caller must not
     616 * modify or delete the string while using the UStringSearch.
     617 *
     618 * @param strsrch search iterator data struct
     619 * @param pattern new pattern string to search for
     620 * @param patternlength pattern length, -1 for null-terminated string
     621 * @param status for errors if it occurs. If pattern is NULL, or patternlength is 0 
     622 *               then an U_ILLEGAL_ARGUMENT_ERROR is returned with no change
     623 *               done to strsrch.
     624 * @stable ICU 2.4
     625 */
     626 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch, 
     627                                         const UChar         *pattern,
     628                                               int32_t        patternlength,
     629                                               UErrorCode    *status);
    630 
    631 /**
    632 * Gets the search pattern.
    633 * @param strsrch search iterator data struct
    634 * @param length receives the length of the pattern; -1 indicates that the 
    635 *               pattern is null-terminated
    636 * @return pattern string
    637 * @stable ICU 2.4
    638 */
    639 U_CAPI const UChar * U_EXPORT2 usearch_getPattern(
    640                                               const UStringSearch *strsrch, 
    641                                                     int32_t       *length);
    642 
    643 /* methods ------------------------------------------------------------- */
    644 
    645 /**
    646 * Returns the first index at which the string text matches the search 
    647 * pattern.
    648 * The iterator is adjusted so that its current index (as returned by 
    649 * <code>usearch_getOffset</code>) is the match position if one was found.
    650 * If a match is not found, <code>USEARCH_DONE</code> will be returned and
    651 * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
    652 * @param strsrch search iterator data struct
    653 * @param status for errors if any occur
    654 * @return The character index of the first match, or 
    655 * <code>USEARCH_DONE</code> if there are no matches.
    656 * @see #usearch_getOffset
    657 * @see #USEARCH_DONE
    658 * @stable ICU 2.4
    659 */
    660 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 
    661                                           UErrorCode    *status);
    662 
    663 /**
    664 * Returns the first index equal to or greater than <code>position</code> at
    665 * which the string text
    666 * matches the search pattern. The iterator is adjusted so that its current 
    667 * index (as returned by <code>usearch_getOffset</code>) is the match position if 
    668 * one was found.
    669 * If a match is not found, <code>USEARCH_DONE</code> will be returned and
    670 * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
    671 * <p>
    672 * Search positions that may render incorrect results are highlighted in the
    673 * header comments. If position lies outside the text range 
    674 * for searching, an U_INDEX_OUTOFBOUNDS_ERROR will be returned.
    675 * @param strsrch search iterator data struct
    676 * @param position to start the search at
    677 * @param status for errors if any occur
    678 * @return The character index of the first match following <code>position</code>,
    679 *         or <code>USEARCH_DONE</code> if there are no matches.
    680 * @see #usearch_getOffset
    681 * @see #USEARCH_DONE
    682 * @stable ICU 2.4
    683 */
    684 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 
    685                                               int32_t    position, 
    686                                               UErrorCode    *status);
    687    
    688 /**
    689 * Returns the last index in the target text at which it matches the search 
    690 * pattern. The iterator is adjusted so that its current 
    691 * index (as returned by <code>usearch_getOffset</code>) is the match position if 
    692 * one was found.
    693 * If a match is not found, <code>USEARCH_DONE</code> will be returned and
    694 * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
    695 * @param strsrch search iterator data struct
    696 * @param status for errors if any occur
    697 * @return The index of the last match, or <code>USEARCH_DONE</code> if there 
    698 *         are no matches.
    699 * @see #usearch_getOffset
    700 * @see #USEARCH_DONE
    701 * @stable ICU 2.4
    702 */
    703 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 
    704                                          UErrorCode    *status);
    705 
    706 /**
    707 * Returns the first index less than <code>position</code> at which the string text 
    708 * matches the search pattern. The iterator is adjusted so that its current 
    709 * index (as returned by <code>usearch_getOffset</code>) is the match position if 
    710 * one was found.
    711 * If a match is not found, <code>USEARCH_DONE</code> will be returned and
    712 * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
    713 * <p>
    714 * Search positions that may render incorrect results are highlighted in the
    715 * header comments. If position lies outside the text range 
    716 * for searching, an U_INDEX_OUTOFBOUNDS_ERROR will be returned.
    717 * <p>
    718 * When the <code>USEARCH_OVERLAP</code> option is off, the last index of the
    719 * result match is always less than <code>position</code>.
    720 * When <code>USEARCH_OVERLAP</code> is on, the result match may span across
    721 * <code>position</code>.
    722 * @param strsrch search iterator data struct
    723 * @param position index position the search is to begin at
    724 * @param status for errors if any occur
    725 * @return The character index of the first match preceding <code>position</code>,
    726 *         or <code>USEARCH_DONE</code> if there are no matches.
    727 * @see #usearch_getOffset
    728 * @see #USEARCH_DONE
    729 * @stable ICU 2.4
    730 */
    731 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 
    732                                               int32_t    position, 
    733                                               UErrorCode    *status);
    734    
    735 /**
    736 * Returns the index of the next point at which the string text matches the
    737 * search pattern, starting from the current position.
    738 * The iterator is adjusted so that its current 
    739 * index (as returned by <code>usearch_getOffset</code>) is the match position if 
    740 * one was found.
    741 * If a match is not found, <code>USEARCH_DONE</code> will be returned and
    742 * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
    743 * @param strsrch search iterator data struct
    744 * @param status for errors if any occur
    745 * @return The index of the next match after the current position, or 
    746 *         <code>USEARCH_DONE</code> if there are no more matches.
    747 * @see #usearch_first
    748 * @see #usearch_getOffset
    749 * @see #USEARCH_DONE
    750 * @stable ICU 2.4
    751 */
    752 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 
    753                                          UErrorCode    *status);
    754 
    755 /**
    756 * Returns the index of the previous point at which the string text matches
    757 * the search pattern, starting at the current position.
    758 * The iterator is adjusted so that its current 
    759 * index (as returned by <code>usearch_getOffset</code>) is the match position if 
    760 * one was found.
    761 * If a match is not found, <code>USEARCH_DONE</code> will be returned and
    762 * the iterator will be adjusted to the index <code>USEARCH_DONE</code>.
    763 * @param strsrch search iterator data struct
    764 * @param status for errors if any occur
    765 * @return The index of the previous match before the current position,
    766 *         or <code>USEARCH_DONE</code> if there are no more matches.
    767 * @see #usearch_last
    768 * @see #usearch_getOffset
    769 * @see #USEARCH_DONE
    770 * @stable ICU 2.4
    771 */
    772 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 
    773                                              UErrorCode    *status);
    774    
    775 /** 
    776 * Resets the iteration.
    777 * The search will begin at the start of the text string if a forward iteration 
    778 * is initiated before a backwards iteration. Otherwise, if a backwards 
    779 * iteration is initiated before a forwards iteration, the search will begin
    780 * at the end of the text string.
    781 * @param strsrch search iterator data struct
    782 * @see #usearch_first
    783 * @stable ICU 2.4
    784 */
    785 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch);
    786 
    787 #ifndef U_HIDE_INTERNAL_API
    788 /**
    789  *  Simple forward search for the pattern, starting at a specified index,
    790  *     and using a default set of search options.
    791  *
    792  *  This is an experimental function, and is not an official part of the
    793  *      ICU API.
    794  *
    795  *  The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored.
    796  *
    797  *  The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and
    798  *  any Break Iterator are ignored.
    799  *
    800  *  Matches obey the following constraints:
    801  *
    802  *      Characters at the start or end positions of a match that are ignorable
    803  *      for collation are not included as part of the match, unless they
    804  *      are part of a combining sequence, as described below.
    805  *
    806  *      A match will not include a partial combining sequence.  Combining
    807  *      character sequences are considered to be inseparable units,
    808  *      and either match the pattern completely, or are considered to not match
    809  *      at all.  Thus, for example, an A followed by a combining accent mark will 
    810  *      not be found when searching for a plain (unaccented) A (unless
    811  *      the collation strength has been set to ignore all accents).
    812  *
    813  *      When beginning a search, the initial starting position, startIdx,
    814  *      is assumed to be an acceptable match boundary with respect to
    815  *      combining characters.  A combining sequence that spans across the
    816  *      starting point will not suppress a match beginning at startIdx.
    817  *
    818  *      Characters that expand to multiple collation elements
    819  *      (German sharp-S becoming 'ss', or the composed forms of accented
    820  *      characters, for example) also must match completely.
    821  *      Searching for a single 's' in a string containing only a sharp-s will 
    822  *      find no match.
    823  *
    824  *
    825  *  @param strsrch    the UStringSearch struct, which references both
    826  *                    the text to be searched and the pattern being sought.
    827  *  @param startIdx   The index into the text to begin the search.
    828  *  @param matchStart An out parameter, the starting index of the matched text.
    829  *                    This parameter may be NULL.
    830  *                    A value of -1 will be returned if no match was found.
    831  *  @param matchLimit Out parameter, the index of the first position following the matched text.
    832  *                    The matchLimit will be at a suitable position for beginning a subsequent search
    833  *                    in the input text.
    834  *                    This parameter may be NULL.
    835  *                    A value of -1 will be returned if no match was found.
    836  *
    837  *  @param status     Report any errors.  Note that no match found is not an error.
    838  *  @return           true if a match was found, false otherwise.
    839  *
    840  *  @internal
    841  */
    842 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
    843                                          int32_t        startIdx,
    844                                          int32_t        *matchStart,
    845                                          int32_t        *matchLimit,
    846                                          UErrorCode     *status);
    847 
    848 /**
    849  *  Simple backwards search for the pattern, starting at a specified index,
    850  *     and using a default set of search options.
    851  *
    852  *  This is an experimental function, and is not an official part of the
    853  *      ICU API.
    854  *
    855  *  The collator options, such as UCOL_STRENGTH and UCOL_NORMALIZATION_MODE, are honored.
    856  *
    857  *  The UStringSearch options USEARCH_CANONICAL_MATCH, USEARCH_OVERLAP and
    858  *  any Break Iterator are ignored.
    859  *
    860  *  Matches obey the following constraints:
    861  *
    862  *      Characters at the start or end positions of a match that are ignorable
    863  *      for collation are not included as part of the match, unless they
    864  *      are part of a combining sequence, as described below.
    865  *
    866  *      A match will not include a partial combining sequence.  Combining
    867  *      character sequences are considered to be inseparable units,
    868  *      and either match the pattern completely, or are considered to not match
    869  *      at all.  Thus, for example, an A followed by a combining accent mark will 
    870  *      not be found when searching for a plain (unaccented) A (unless
    871  *      the collation strength has been set to ignore all accents).
    872  *
    873  *      When beginning a search, the initial starting position, startIdx,
    874  *      is assumed to be an acceptable match boundary with respect to
    875  *      combining characters.  A combining sequence that spans across the
    876  *      starting point will not suppress a match beginning at startIdx.
    877  *
    878  *      Characters that expand to multiple collation elements
    879  *      (German sharp-S becoming 'ss', or the composed forms of accented
    880  *      characters, for example) also must match completely.
    881  *      Searching for a single 's' in a string containing only a sharp-s will 
    882  *      find no match.
    883  *
    884  *
    885  *  @param strsrch    the UStringSearch struct, which references both
    886  *                    the text to be searched and the pattern being sought.
    887  *  @param startIdx   The index into the text to begin the search.
    888  *  @param matchStart An out parameter, the starting index of the matched text.
    889  *                    This parameter may be NULL.
    890  *                    A value of -1 will be returned if no match was found.
    891  *  @param matchLimit Out parameter, the index of the first position following the matched text.
    892  *                    The matchLimit will be at a suitable position for beginning a subsequent search
    893  *                    in the input text.
    894  *                    This parameter may be NULL.
    895  *                    A value of -1 will be returned if no match was found.
    896  *
    897  *  @param status     Report any errors.  Note that no match found is not an error.
    898  *  @return           true if a match was found, false otherwise.
    899  *
    900  *  @internal
    901  */
    902 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
    903                                                   int32_t        startIdx,
    904                                                   int32_t        *matchStart,
    905                                                   int32_t        *matchLimit,
    906                                                   UErrorCode     *status);
    907 #endif  /* U_HIDE_INTERNAL_API */
    908 
    909 #endif /* #if !UCONFIG_NO_COLLATION  && !UCONFIG_NO_BREAK_ITERATION */
    910 
    911 #endif