tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ubrk.h (25021B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 * Copyright (C) 1996-2015, International Business Machines Corporation and others.
      6 * All Rights Reserved.
      7 ******************************************************************************
      8 */
      9 
     10 #ifndef UBRK_H
     11 #define UBRK_H
     12 
     13 #include "unicode/utypes.h"
     14 #include "unicode/uloc.h"
     15 #include "unicode/utext.h"
     16 
     17 #if U_SHOW_CPLUSPLUS_API
     18 #include "unicode/localpointer.h"
     19 #endif   // U_SHOW_CPLUSPLUS_API
     20 
     21 /**
     22 * A text-break iterator.
     23 *  For usage in C programs.
     24 */
     25 #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
     26 #   define UBRK_TYPEDEF_UBREAK_ITERATOR
     27    /**
     28     *  Opaque type representing an ICU Break iterator object.
     29     *  @stable ICU 2.0
     30     */
     31    typedef struct UBreakIterator UBreakIterator;
     32 #endif
     33 
     34 #if !UCONFIG_NO_BREAK_ITERATION
     35 
     36 #include "unicode/parseerr.h"
     37 
     38 /**
     39 * \file
     40 * \brief C API: BreakIterator
     41 *
     42 * <h2> BreakIterator C API </h2>
     43 *
     44 * The BreakIterator C API defines methods for finding the location
     45 * of boundaries in text. A UBreakIterator maintains a current
     46 * position and scans over text, returning the index of characters
     47 * where boundaries occur.
     48 * <p>
     49 * Line boundary analysis determines where a text string can be broken
     50 * when line-wrapping. The mechanism correctly handles punctuation and
     51 * hyphenated words.
     52 * <p>
     53 * Note: The locale keyword "lb" can be used to modify line break
     54 * behavior according to the CSS level 3 line-break options, see
     55 * <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:
     56 * "ja@lb=strict", "zh@lb=loose".
     57 * <p>
     58 * Sentence boundary analysis allows selection with correct
     59 * interpretation of periods within numbers and abbreviations, and
     60 * trailing punctuation marks such as quotation marks and parentheses.
     61 * <p>
     62 * Note: The locale keyword "ss" can be used to enable use of
     63 * segmentation suppression data (preventing breaks in English after
     64 * abbreviations such as "Mr." or "Est.", for example), as follows:
     65 * "en@ss=standard".
     66 * <p>
     67 * Word boundary analysis is used by search and replace functions, as
     68 * well as within text editing applications that allow the user to
     69 * select words with a double click. Word selection provides correct
     70 * interpretation of punctuation marks within and following
     71 * words. Characters that are not part of a word, such as symbols or
     72 * punctuation marks, have word-breaks on both sides.
     73 * <p>
     74 * Character boundary analysis identifies the boundaries of
     75 * "Extended Grapheme Clusters", which are groupings of codepoints
     76 * that should be treated as character-like units for many text operations.
     77 * Please see Unicode Standard Annex #29, Unicode Text Segmentation,
     78 * http://www.unicode.org/reports/tr29/ for additional information
     79 * on grapheme clusters and guidelines on their use.
     80 * <p>
     81 * Title boundary analysis locates all positions,
     82 * typically starts of words, that should be set to Title Case
     83 * when title casing the text.
     84 * <p>
     85 * The text boundary positions are found according to the rules
     86 * described in Unicode Standard Annex #29, Unicode Text Segmentation,
     87 * and Unicode Standard Annex #14, Unicode Line Breaking Algorithm.
     88 * These are available at http://www.unicode.org/reports/tr29/ and
     89 * http://www.unicode.org/reports/tr14/, respectively.
     90 * <p>
     91 * In addition to the plain C API defined in this header file, an
     92 * object oriented C++ API with equivalent functionality is defined in the
     93 * file brkiter.h.
     94 * <p>
     95 * Code snippets illustrating the use of the Break Iterator APIs
     96 * are available in the ICU User Guide,
     97 * https://unicode-org.github.io/icu/userguide/boundaryanalysis/
     98 * and in the sample program icu/source/samples/break/break.cpp
     99 */
    100 
    101 /** The possible types of text boundaries.  @stable ICU 2.0 */
    102 typedef enum UBreakIteratorType {
    103  /** Character breaks  @stable ICU 2.0 */
    104  UBRK_CHARACTER = 0,
    105  /** Word breaks @stable ICU 2.0 */
    106  UBRK_WORD = 1,
    107  /** Line breaks @stable ICU 2.0 */
    108  UBRK_LINE = 2,
    109  /** Sentence breaks @stable ICU 2.0 */
    110  UBRK_SENTENCE = 3,
    111 
    112 #ifndef U_HIDE_DEPRECATED_API
    113  /**
    114   * Title Case breaks
    115   * The iterator created using this type locates title boundaries as described for
    116   * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
    117   * please use Word Boundary iterator.
    118   *
    119   * @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
    120   */
    121  UBRK_TITLE = 4,
    122    /**
    123     * One more than the highest normal UBreakIteratorType value.
    124     * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
    125     */
    126    UBRK_COUNT = 5
    127 #endif  // U_HIDE_DEPRECATED_API
    128 } UBreakIteratorType;
    129 
    130 /** Value indicating all text boundaries have been returned.
    131 *  @stable ICU 2.0
    132 */
    133 #define UBRK_DONE ((int32_t) -1)
    134 
    135 
    136 /**
    137 *  Enum constants for the word break tags returned by ubrk_getRuleStatus().
    138 *  A range of values is defined for each category of word, to allow for
    139 *  further subdivisions of a category in future releases.  Applications
    140 *  should check for tag values falling within the range (from the lower limit
    141 *  up to, but not including, the _LIMIT value), rather than for single values.
    142 *
    143 * The numeric values of all of these constants are stable (will not change).
    144 *
    145 * @stable ICU 2.2
    146 */
    147 typedef enum UWordBreak {
    148    /** Tag value for "words" that do not fit into any of the other categories.
    149     *  Includes spaces and most punctuation. */
    150    UBRK_WORD_NONE           = 0,
    151    /** Upper bound for tags for uncategorized words. */
    152    UBRK_WORD_NONE_LIMIT     = 100,
    153    /** Tag value for words that appear to be numbers, lower limit.    */
    154    UBRK_WORD_NUMBER         = 100,
    155    /** Tag value for words that appear to be numbers, upper limit.    */
    156    UBRK_WORD_NUMBER_LIMIT   = 200,
    157    /** Tag value for words that contain letters, excluding
    158     *  hiragana, katakana or ideographic characters, lower limit.    */
    159    UBRK_WORD_LETTER         = 200,
    160    /** Tag value for words containing letters, upper limit  */
    161    UBRK_WORD_LETTER_LIMIT   = 300,
    162    /** Tag value for words containing kana characters, lower limit */
    163    UBRK_WORD_KANA           = 300,
    164    /** Tag value for words containing kana characters, upper limit */
    165    UBRK_WORD_KANA_LIMIT     = 400,
    166    /** Tag value for words containing ideographic characters, lower limit */
    167    UBRK_WORD_IDEO           = 400,
    168    /** Tag value for words containing ideographic characters, upper limit */
    169    UBRK_WORD_IDEO_LIMIT     = 500
    170 } UWordBreak;
    171 
    172 /**
    173 *  Enum constants for the line break tags returned by ubrk_getRuleStatus().
    174 *  A range of values is defined for each category of
    175 *  line break, to allow for further subdivisions of a category in future releases.
    176 *  Applications should check for tag values falling within the range, rather
    177 *  than for single individual values.
    178 *
    179 * The numeric values of all of these constants are stable (will not change).
    180 *
    181 * @stable ICU 2.8
    182 */
    183 typedef enum ULineBreakTag {
    184    /** Tag value for soft line breaks, positions at which a line break
    185      *  is acceptable but not required                */
    186    UBRK_LINE_SOFT            = 0,
    187    /** Upper bound for soft line breaks.              */
    188    UBRK_LINE_SOFT_LIMIT      = 100,
    189    /** Tag value for a hard, or mandatory line break  */
    190    UBRK_LINE_HARD            = 100,
    191    /** Upper bound for hard line breaks.              */
    192    UBRK_LINE_HARD_LIMIT      = 200
    193 } ULineBreakTag;
    194 
    195 
    196 
    197 /**
    198 *  Enum constants for the sentence break tags returned by ubrk_getRuleStatus().
    199 *  A range of values is defined for each category of
    200 *  sentence, to allow for further subdivisions of a category in future releases.
    201 *  Applications should check for tag values falling within the range, rather
    202 *  than for single individual values.
    203 *
    204 * The numeric values of all of these constants are stable (will not change).
    205 *
    206 * @stable ICU 2.8
    207 */
    208 typedef enum USentenceBreakTag {
    209    /** Tag value for sentences ending with a sentence terminator
    210      * ('.', '?', '!', etc.) character, possibly followed by a
    211      * hard separator (CR, LF, PS, etc.)
    212      */
    213    UBRK_SENTENCE_TERM       = 0,
    214    /** Upper bound for tags for sentences ended by sentence terminators.    */
    215    UBRK_SENTENCE_TERM_LIMIT = 100,
    216    /** Tag value for sentences that do not contain an ending
    217      * sentence terminator ('.', '?', '!', etc.) character, but
    218      * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
    219      */
    220    UBRK_SENTENCE_SEP        = 100,
    221    /** Upper bound for tags for sentences ended by a separator.              */
    222    UBRK_SENTENCE_SEP_LIMIT  = 200
    223 
    224 } USentenceBreakTag;
    225 
    226 
    227 /**
    228 * Open a new UBreakIterator for locating text boundaries for a specified locale.
    229 * A UBreakIterator may be used for detecting character, line, word,
    230 * and sentence breaks in text.
    231 * @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
    232 * UBRK_LINE, UBRK_SENTENCE
    233 * @param locale The locale specifying the text-breaking conventions. Note that
    234 * locale keys such as "lb" and "ss" may be used to modify text break behavior,
    235 * see general discussion of BreakIterator C API.
    236 * @param text The text to be iterated over. May be null, in which case ubrk_setText() is
    237 *        used to specify the text to be iterated.
    238 * @param textLength The number of characters in text, or -1 if null-terminated.
    239 * @param status A UErrorCode to receive any errors.
    240 * @return A UBreakIterator for the specified locale.
    241 * @see ubrk_openRules
    242 * @stable ICU 2.0
    243 */
    244 U_CAPI UBreakIterator* U_EXPORT2
    245 ubrk_open(UBreakIteratorType type,
    246      const char *locale,
    247      const UChar *text,
    248      int32_t textLength,
    249      UErrorCode *status);
    250 
    251 /**
    252 * Open a new UBreakIterator for locating text boundaries using specified breaking rules.
    253 * The rule syntax is described in the Boundary Analysis chapter of the ICU User Guide.
    254 * @param rules A set of rules specifying the text breaking conventions.
    255 * @param rulesLength The number of characters in rules, or -1 if null-terminated.
    256 * @param text The text to be iterated over.  May be null, in which case ubrk_setText() is
    257 *        used to specify the text to be iterated.
    258 * @param textLength The number of characters in text, or -1 if null-terminated.
    259 * @param parseErr   Receives position and context information for any syntax errors
    260 *                   detected while parsing the rules.
    261 * @param status A UErrorCode to receive any errors.
    262 * @return A UBreakIterator for the specified rules.
    263 * @see ubrk_open
    264 * @stable ICU 2.2
    265 */
    266 U_CAPI UBreakIterator* U_EXPORT2
    267 ubrk_openRules(const UChar     *rules,
    268               int32_t         rulesLength,
    269               const UChar     *text,
    270               int32_t          textLength,
    271               UParseError     *parseErr,
    272               UErrorCode      *status);
    273 
    274 /**
    275 * Open a new UBreakIterator for locating text boundaries using precompiled binary rules.
    276 * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.
    277 * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not
    278 * compatible across different major versions of ICU, nor across platforms of different
    279 * endianness or different base character set family (ASCII vs EBCDIC).
    280 * @param binaryRules A set of compiled binary rules specifying the text breaking
    281 *                    conventions. Ownership of the storage containing the compiled
    282 *                    rules remains with the caller of this function. The compiled
    283 *                    rules must not be modified or deleted during the life of the
    284 *                    break iterator.
    285 * @param rulesLength The length of binaryRules in bytes; must be >= 0.
    286 * @param text        The text to be iterated over.  May be null, in which case
    287 *                    ubrk_setText() is used to specify the text to be iterated.
    288 * @param textLength  The number of characters in text, or -1 if null-terminated.
    289 * @param status      Pointer to UErrorCode to receive any errors.
    290 * @return            UBreakIterator for the specified rules.
    291 * @see ubrk_getBinaryRules
    292 * @stable ICU 59
    293 */
    294 U_CAPI UBreakIterator* U_EXPORT2
    295 ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
    296                     const UChar *  text, int32_t textLength,
    297                     UErrorCode *   status);
    298 
    299 #ifndef U_HIDE_DEPRECATED_API
    300 
    301 /**
    302 * Thread safe cloning operation
    303 * @param bi iterator to be cloned
    304 * @param stackBuffer <em>Deprecated functionality as of ICU 52, use NULL.</em><br>
    305 *  user allocated space for the new clone. If NULL new memory will be allocated.
    306 *  If buffer is not large enough, new memory will be allocated.
    307 *  Clients can use the U_BRK_SAFECLONE_BUFFERSIZE.
    308 * @param pBufferSize <em>Deprecated functionality as of ICU 52, use NULL or 1.</em><br>
    309 *  pointer to size of allocated space.
    310 *  If *pBufferSize == 0, a sufficient size for use in cloning will
    311 *  be returned ('pre-flighting')
    312 *  If *pBufferSize is not enough for a stack-based safe clone,
    313 *  new memory will be allocated.
    314 * @param status to indicate whether the operation went on smoothly or there were errors
    315 *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used
    316 * if pBufferSize != NULL and any allocations were necessary
    317 * @return pointer to the new clone
    318 * @deprecated ICU 69 Use ubrk_clone() instead.
    319 */
    320 U_DEPRECATED UBreakIterator * U_EXPORT2
    321 ubrk_safeClone(
    322          const UBreakIterator *bi,
    323          void *stackBuffer,
    324          int32_t *pBufferSize,
    325          UErrorCode *status);
    326 
    327 #endif /* U_HIDE_DEPRECATED_API */
    328 
    329 /**
    330 * Thread safe cloning operation.
    331 * @param bi iterator to be cloned
    332 * @param status to indicate whether the operation went on smoothly or there were errors
    333 * @return pointer to the new clone
    334 * @stable ICU 69
    335 */
    336 U_CAPI UBreakIterator * U_EXPORT2
    337 ubrk_clone(const UBreakIterator *bi,
    338           UErrorCode *status);
    339 
    340 #ifndef U_HIDE_DEPRECATED_API
    341 
    342 /**
    343  * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
    344  * @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.
    345  */
    346 #define U_BRK_SAFECLONE_BUFFERSIZE 1
    347 
    348 #endif /* U_HIDE_DEPRECATED_API */
    349 
    350 /**
    351 * Close a UBreakIterator.
    352 * Once closed, a UBreakIterator may no longer be used.
    353 * @param bi The break iterator to close.
    354 * @stable ICU 2.0
    355 */
    356 U_CAPI void U_EXPORT2
    357 ubrk_close(UBreakIterator *bi);
    358 
    359 #if U_SHOW_CPLUSPLUS_API
    360 
    361 U_NAMESPACE_BEGIN
    362 
    363 /**
    364 * \class LocalUBreakIteratorPointer
    365 * "Smart pointer" class, closes a UBreakIterator via ubrk_close().
    366 * For most methods see the LocalPointerBase base class.
    367 *
    368 * @see LocalPointerBase
    369 * @see LocalPointer
    370 * @stable ICU 4.4
    371 */
    372 U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);
    373 
    374 U_NAMESPACE_END
    375 
    376 #endif
    377 
    378 /**
    379 * Sets an existing iterator to point to a new piece of text.
    380 * The break iterator retains a pointer to the supplied text.
    381 * The caller must not modify or delete the text while the BreakIterator
    382 * retains the reference.
    383 *
    384 * @param bi The iterator to use
    385 * @param text The text to be set
    386 * @param textLength The length of the text
    387 * @param status The error code
    388 * @stable ICU 2.0
    389 */
    390 U_CAPI void U_EXPORT2
    391 ubrk_setText(UBreakIterator* bi,
    392             const UChar*    text,
    393             int32_t         textLength,
    394             UErrorCode*     status);
    395 
    396 
    397 /**
    398 * Sets an existing iterator to point to a new piece of text.
    399 *
    400 * All index positions returned by break iterator functions are
    401 * native indices from the UText. For example, when breaking UTF-8
    402 * encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.
    403 * will be UTF-8 string indices, not UTF-16 positions.
    404 *
    405 * @param bi The iterator to use
    406 * @param text The text to be set.
    407 *             This function makes a shallow clone of the supplied UText.  This means
    408 *             that the caller is free to immediately close or otherwise reuse the
    409 *             UText that was passed as a parameter, but that the underlying text itself
    410 *             must not be altered while being referenced by the break iterator.
    411 * @param status The error code
    412 * @stable ICU 3.4
    413 */
    414 U_CAPI void U_EXPORT2
    415 ubrk_setUText(UBreakIterator* bi,
    416             UText*          text,
    417             UErrorCode*     status);
    418 
    419 
    420 
    421 /**
    422 * Determine the most recently-returned text boundary.
    423 *
    424 * @param bi The break iterator to use.
    425 * @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
    426 * \ref ubrk_first, or \ref ubrk_last.
    427 * @stable ICU 2.0
    428 */
    429 U_CAPI int32_t U_EXPORT2
    430 ubrk_current(const UBreakIterator *bi);
    431 
    432 /**
    433 * Advance the iterator to the boundary following the current boundary.
    434 *
    435 * @param bi The break iterator to use.
    436 * @return The character index of the next text boundary, or UBRK_DONE
    437 * if all text boundaries have been returned.
    438 * @see ubrk_previous
    439 * @stable ICU 2.0
    440 */
    441 U_CAPI int32_t U_EXPORT2
    442 ubrk_next(UBreakIterator *bi);
    443 
    444 /**
    445 * Set the iterator position to the boundary preceding the current boundary.
    446 *
    447 * @param bi The break iterator to use.
    448 * @return The character index of the preceding text boundary, or UBRK_DONE
    449 * if all text boundaries have been returned.
    450 * @see ubrk_next
    451 * @stable ICU 2.0
    452 */
    453 U_CAPI int32_t U_EXPORT2
    454 ubrk_previous(UBreakIterator *bi);
    455 
    456 /**
    457 * Set the iterator position to zero, the start of the text being scanned.
    458 * @param bi The break iterator to use.
    459 * @return The new iterator position (zero).
    460 * @see ubrk_last
    461 * @stable ICU 2.0
    462 */
    463 U_CAPI int32_t U_EXPORT2
    464 ubrk_first(UBreakIterator *bi);
    465 
    466 /**
    467 * Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.
    468 * This is not the same as the last character.
    469 * @param bi The break iterator to use.
    470 * @return The character offset immediately <EM>beyond</EM> the last character in the
    471 * text being scanned.
    472 * @see ubrk_first
    473 * @stable ICU 2.0
    474 */
    475 U_CAPI int32_t U_EXPORT2
    476 ubrk_last(UBreakIterator *bi);
    477 
    478 /**
    479 * Set the iterator position to the first boundary preceding the specified offset.
    480 * The new position is always smaller than offset, or UBRK_DONE.
    481 * @param bi The break iterator to use.
    482 * @param offset The offset to begin scanning.
    483 * @return The text boundary preceding offset, or UBRK_DONE.
    484 * @see ubrk_following
    485 * @stable ICU 2.0
    486 */
    487 U_CAPI int32_t U_EXPORT2
    488 ubrk_preceding(UBreakIterator *bi,
    489           int32_t offset);
    490 
    491 /**
    492 * Advance the iterator to the first boundary following the specified offset.
    493 * The value returned is always greater than offset, or UBRK_DONE.
    494 * @param bi The break iterator to use.
    495 * @param offset The offset to begin scanning.
    496 * @return The text boundary following offset, or UBRK_DONE.
    497 * @see ubrk_preceding
    498 * @stable ICU 2.0
    499 */
    500 U_CAPI int32_t U_EXPORT2
    501 ubrk_following(UBreakIterator *bi,
    502           int32_t offset);
    503 
    504 /**
    505 * Get a locale for which text breaking information is available.
    506 * A UBreakIterator in a locale returned by this function will perform the correct
    507 * text breaking for the locale.
    508 * @param index The index of the desired locale.
    509 * @return A locale for which text breaking information is available, or 0 (a null pointer) if none.
    510 * @see ubrk_countAvailable
    511 * @stable ICU 2.0
    512 */
    513 U_CAPI const char* U_EXPORT2
    514 ubrk_getAvailable(int32_t index);
    515 
    516 /**
    517 * Determine how many locales have text breaking information available.
    518 * This function is most useful as determining the loop ending condition for
    519 * calls to \ref ubrk_getAvailable.
    520 * @return The number of locales for which text breaking information is available.
    521 * @see ubrk_getAvailable
    522 * @stable ICU 2.0
    523 */
    524 U_CAPI int32_t U_EXPORT2
    525 ubrk_countAvailable(void);
    526 
    527 
    528 /**
    529 * Returns true if the specified position is a boundary position.  As a side
    530 * effect, leaves the iterator pointing to the first boundary position at
    531 * or after "offset".
    532 * @param bi The break iterator to use.
    533 * @param offset the offset to check.
    534 * @return True if "offset" is a boundary position.
    535 * @stable ICU 2.0
    536 */
    537 U_CAPI  UBool U_EXPORT2
    538 ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
    539 
    540 /**
    541 * Return the status from the break rule that determined the most recently
    542 * returned break position.  The values appear in the rule source within
    543 * brackets, {123}, for example.  For word, line and sentence break iterators
    544 * the possible values are defined in UWordBreak, ULineBreakTag and USentenceBreakTag.
    545 * @param bi The break iterator to use.
    546 * @return The rule status value; 0 if the rule does not specify a status.
    547 * @stable ICU 2.2
    548 */
    549 U_CAPI  int32_t U_EXPORT2
    550 ubrk_getRuleStatus(UBreakIterator *bi);
    551 
    552 /**
    553 * Get the statuses from the break rules that determined the most recently
    554 * returned break position.  The values appear in the rule source
    555 * within brackets, {123}, for example.  The default status value for rules
    556 * that do not explicitly provide one is zero.
    557 * <p>
    558 * For word, line and sentence break iterators, see UWordBreak, ULineBreakTag and USentenceBreakTag.
    559 * @param bi        The break iterator to use
    560 * @param fillInVec an array to be filled in with the status values.
    561 * @param capacity  the length of the supplied vector.  A length of zero causes
    562 *                  the function to return the number of status values, in the
    563 *                  normal way, without attempting to store any values.
    564 * @param status    receives error codes.
    565 * @return          The number of rule status values from rules that determined
    566 *                  the most recent boundary returned by the break iterator.
    567 * @stable ICU 3.0
    568 */
    569 U_CAPI  int32_t U_EXPORT2
    570 ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
    571 
    572 /**
    573 * Return the locale of the break iterator. You can choose between the valid and
    574 * the actual locale.
    575 * @param bi break iterator
    576 * @param type locale type (valid or actual)
    577 * @param status error code
    578 * @return locale string
    579 * @stable ICU 2.8
    580 */
    581 U_CAPI const char* U_EXPORT2
    582 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
    583 
    584 /**
    585  *  Set the subject text string upon which the break iterator is operating
    586  *  without changing any other aspect of the state.
    587  *  The new and previous text strings must have the same content.
    588  *
    589  *  This function is intended for use in environments where ICU is operating on
    590  *  strings that may move around in memory.  It provides a mechanism for notifying
    591  *  ICU that the string has been relocated, and providing a new UText to access the
    592  *  string in its new position.
    593  *
    594  *  Note that the break iterator never copies the underlying text
    595  *  of a string being processed, but always operates directly on the original text
    596  *  provided by the user. Refreshing simply drops the references to the old text
    597  *  and replaces them with references to the new.
    598  *
    599  *  Caution:  this function is normally used only by very specialized
    600  *            system-level code.   One example use case is with garbage collection
    601  *            that moves the text in memory.
    602  *
    603  * @param bi         The break iterator.
    604  * @param text       The new (moved) text string.
    605  * @param status     Receives errors detected by this function.
    606  *
    607  * @stable ICU 49
    608  */
    609 U_CAPI void U_EXPORT2
    610 ubrk_refreshUText(UBreakIterator *bi,
    611                       UText          *text,
    612                       UErrorCode     *status);
    613 
    614 
    615 /**
    616 * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.
    617 * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator
    618 * more quickly than using ubrk_openRules. The compiled rules are not compatible across
    619 * different major versions of ICU, nor across platforms of different endianness or
    620 * different base character set family (ASCII vs EBCDIC). Supports preflighting (with
    621 * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to
    622 * the binaryRules buffer. However, whether preflighting or not, if the actual length
    623 * is greater than INT32_MAX, then the function returns 0 and sets *status to
    624 * U_INDEX_OUTOFBOUNDS_ERROR.
    625 *
    626 * @param bi            The break iterator to use.
    627 * @param binaryRules   Buffer to receive the compiled binary rules; set to NULL for
    628 *                      preflighting.
    629 * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for
    630 *                      preflighting. Must be >= 0.
    631 * @param status        Pointer to UErrorCode to receive any errors, such as
    632 *                      U_BUFFER_OVERFLOW_ERROR, U_INDEX_OUTOFBOUNDS_ERROR, or
    633 *                      U_ILLEGAL_ARGUMENT_ERROR.
    634 * @return              The actual byte length of the binary rules, if <= INT32_MAX;
    635 *                      otherwise 0. If not preflighting and this is larger than
    636 *                      rulesCapacity, *status will be set to an error.
    637 * @see ubrk_openBinaryRules
    638 * @stable ICU 59
    639 */
    640 U_CAPI int32_t U_EXPORT2
    641 ubrk_getBinaryRules(UBreakIterator *bi,
    642                    uint8_t *       binaryRules, int32_t rulesCapacity,
    643                    UErrorCode *    status);
    644 
    645 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    646 
    647 #endif