tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

usrchimp.h (9013B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
      6 **********************************************************************
      7 *   Date        Name        Description
      8 *  08/13/2001   synwee      Creation.
      9 **********************************************************************
     10 */
     11 #ifndef USRCHIMP_H
     12 #define USRCHIMP_H
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_COLLATION
     17 
     18 #include "unicode/normalizer2.h"
     19 #include "unicode/ucol.h"
     20 #include "unicode/ucoleitr.h"
     21 #include "unicode/ubrk.h"
     22 
     23 /* mask off anything but primary order */
     24 #define UCOL_PRIMARYORDERMASK 0xffff0000
     25 /* mask off anything but secondary order */
     26 #define UCOL_SECONDARYORDERMASK 0x0000ff00
     27 /* mask off anything but tertiary order */
     28 #define UCOL_TERTIARYORDERMASK 0x000000ff
     29 /* primary order shift */
     30 #define UCOL_PRIMARYORDERSHIFT 16
     31 /* secondary order shift */
     32 #define UCOL_SECONDARYORDERSHIFT 8
     33 
     34 #define UCOL_IGNORABLE 0
     35 
     36 /* get weights from a CE */
     37 #define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
     38 #define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
     39 #define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)
     40 
     41 #define UCOL_CONTINUATION_MARKER 0xC0
     42 
     43 #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
     44 
     45 /**
     46 * This indicates an error has occurred during processing or there are no more CEs 
     47 * to be returned.
     48 */
     49 #define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)
     50 
     51 U_NAMESPACE_BEGIN
     52 
     53 class CollationElementIterator;
     54 class Collator;
     55 
     56 struct PCEI
     57 {
     58    uint64_t ce;
     59    int32_t  low;
     60    int32_t  high;
     61 };
     62 
     63 struct PCEBuffer
     64 {
     65    PCEI    defaultBuffer[16];
     66    PCEI   *buffer;
     67    int32_t bufferIndex;
     68    int32_t bufferSize;
     69 
     70    PCEBuffer();
     71    ~PCEBuffer();
     72 
     73    void  reset();
     74    UBool isEmpty() const;
     75    void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
     76    const PCEI *get();
     77 };
     78 
     79 class UCollationPCE : public UMemory {
     80 private:
     81    PCEBuffer          pceBuffer;
     82    CollationElementIterator *cei;
     83    UCollationStrength strength;
     84    UBool              toShift;
     85    UBool              isShifted;
     86    uint32_t           variableTop;
     87 
     88 public:
     89    UCollationPCE(UCollationElements *elems);
     90    UCollationPCE(CollationElementIterator *iter);
     91    ~UCollationPCE();
     92 
     93    void init(UCollationElements *elems);
     94    void init(CollationElementIterator *iter);
     95 
     96    /**
     97     * Get the processed ordering priority of the next collation element in the text.
     98     * A single character may contain more than one collation element.
     99     *
    100     * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
    101     * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
    102     * @param status A pointer to an UErrorCode to receive any errors.
    103     * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER 
    104     *         if an error has occurred or if the end of string has been reached
    105     */
    106    int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    107    /**
    108     * Get the processed ordering priority of the previous collation element in the text.
    109     * A single character may contain more than one collation element.
    110     *
    111     * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
    112     * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
    113     * @param status A pointer to an UErrorCode to receive any errors. Notably 
    114     *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
    115     *               buffer has been exhausted.
    116     * @return The previous collation elements ordering, otherwise returns 
    117     *         UCOL_PROCESSED_NULLORDER if an error has occurred or if the start of
    118     *         string has been reached.
    119     */
    120    int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
    121 
    122 private:
    123    void init(const Collator &coll);
    124    uint64_t processCE(uint32_t ce);
    125 };
    126 
    127 U_NAMESPACE_END
    128 
    129 #define INITIAL_ARRAY_SIZE_       256
    130 
    131 struct USearch {
    132    // required since collation element iterator does not have a getText API
    133    const UChar              *text;
    134          int32_t             textLength; // exact length
    135          UBool               isOverlap;
    136          UBool               isCanonicalMatch;
    137          int16_t             elementComparisonType;
    138          UBreakIterator     *internalBreakIter;  // internal character breakiterator, lazily created.
    139          UBreakIterator     *breakIter;          // caller provided character breakiterator
    140    // value USEARCH_DONE is the default value
    141    // if we are not at the start of the text or the end of the text, 
    142    // depending on the iteration direction and matchedIndex is USEARCH_DONE 
    143    // it means that we can't find any more matches in that particular direction
    144          int32_t             matchedIndex; 
    145          int32_t             matchedLength;
    146          UBool               isForwardSearching;
    147          UBool               reset;
    148 };
    149 
    150 struct UPattern {
    151    const UChar              *text;
    152          int32_t             textLength; // exact length
    153          // length required for backwards ce comparison
    154          int32_t             cesLength;
    155          int32_t            *ces;
    156          int32_t             cesBuffer[INITIAL_ARRAY_SIZE_];
    157          int32_t             pcesLength;
    158          int64_t            *pces;
    159          int64_t             pcesBuffer[INITIAL_ARRAY_SIZE_];
    160          UBool               hasPrefixAccents;
    161          UBool               hasSuffixAccents;
    162 };
    163 
    164 struct UStringSearch {
    165    struct USearch            *search;
    166    struct UPattern            pattern;
    167    const  UCollator          *collator;
    168    const  icu::Normalizer2   *nfd;
    169    // positions within the collation element iterator is used to determine
    170    // if we are at the start of the text.
    171           UCollationElements *textIter;
    172           icu::UCollationPCE *textProcessedIter;
    173    // utility collation element, used throughout program for temporary 
    174    // iteration.
    175           UCollationElements *utilIter;
    176           UBool               ownCollator;
    177           UCollationStrength  strength;
    178           uint32_t            ceMask;
    179           uint32_t            variableTop;
    180           UBool               toShift;
    181 };
    182 
    183 /**
    184 * Exact matches without checking for the ends for extra accents.
    185 * The match after the position within the collation element iterator is to be
    186 * found. 
    187 * After a match is found the offset in the collation element iterator will be
    188 * shifted to the start of the match.
    189 * Implementation note: 
    190 * For tertiary we can't use the collator->tertiaryMask, that is a 
    191 * preprocessed mask that takes into account case options. since we are only 
    192 * concerned with exact matches, we don't need that.
    193 * Alternate handling - since only the 16 most significant digits is only used, 
    194 * we can safely do a compare without masking if the ce is a variable, we mask 
    195 * and get only the primary values no shifting to quartenary is required since 
    196 * all primary values less than variabletop will need to be masked off anyway.
    197 * If the end character is composite and the pattern ce does not match the text 
    198 * ce, we skip it until we find a match in the end composite character or when 
    199 * it has passed the character. This is so that we can match pattern "a" with
    200 * the text "\u00e6" 
    201 * @param strsrch string search data
    202 * @param status error status if any
    203 * @return true if an exact match is found, false otherwise
    204 */
    205 U_CFUNC
    206 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
    207 
    208 /**
    209 * Canonical matches.
    210 * According to the definition, matches found here will include the whole span 
    211 * of beginning and ending accents if it overlaps that region.
    212 * @param strsrch string search data
    213 * @param status error status if any
    214 * @return true if a canonical match is found, false otherwise
    215 */
    216 U_CFUNC
    217 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
    218 
    219 /**
    220 * Gets the previous match.
    221 * Comments follows from handleNextExact
    222 * @param strsrch string search data
    223 * @param status error status if any
    224 * @return True if a exact math is found, false otherwise.
    225 */
    226 U_CFUNC
    227 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
    228 
    229 /**
    230 * Canonical matches.
    231 * According to the definition, matches found here will include the whole span 
    232 * of beginning and ending accents if it overlaps that region.
    233 * @param strsrch string search data
    234 * @param status error status if any
    235 * @return true if a canonical match is found, false otherwise
    236 */
    237 U_CFUNC
    238 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 
    239                                      UErrorCode    *status);
    240 
    241 #endif /* #if !UCONFIG_NO_COLLATION */
    242 
    243 #endif