tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucasemap_imp.h (9951B)


      1 // © 2017 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 // ucasemap_imp.h
      5 // created: 2017feb08 Markus W. Scherer
      6 
      7 #ifndef __UCASEMAP_IMP_H__
      8 #define __UCASEMAP_IMP_H__
      9 
     10 #include "unicode/utypes.h"
     11 #include "unicode/ucasemap.h"
     12 #include "unicode/uchar.h"
     13 #include "ucase.h"
     14 
     15 /**
     16 * Bit mask for the titlecasing iterator options bit field.
     17 * Currently only 3 out of 8 values are used:
     18 * 0 (words), U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
     19 * See stringoptions.h.
     20 * @internal
     21 */
     22 #define U_TITLECASE_ITERATOR_MASK 0xe0
     23 
     24 /**
     25 * Bit mask for the titlecasing index adjustment options bit set.
     26 * Currently two bits are defined:
     27 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED.
     28 * See stringoptions.h.
     29 * @internal
     30 */
     31 #define U_TITLECASE_ADJUSTMENT_MASK 0x600
     32 
     33 /**
     34 * Internal API, used by u_strcasecmp() etc.
     35 * Compare strings case-insensitively,
     36 * in code point order or code unit order.
     37 */
     38 U_CFUNC int32_t
     39 u_strcmpFold(const UChar *s1, int32_t length1,
     40             const UChar *s2, int32_t length2,
     41             uint32_t options,
     42             UErrorCode *pErrorCode);
     43 
     44 /**
     45 * Internal API, used for detecting length of
     46 * shared prefix case-insensitively.
     47 * @param s1            input string 1
     48 * @param length1       length of string 1, or -1 (NULL terminated)
     49 * @param s2            input string 2
     50 * @param length2       length of string 2, or -1 (NULL terminated)
     51 * @param options       compare options
     52 * @param matchLen1     (output) length of partial prefix match in s1
     53 * @param matchLen2     (output) length of partial prefix match in s2
     54 * @param pErrorCode    receives error status
     55 */
     56 U_CAPI void
     57 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
     58                             const UChar *s2, int32_t length2,
     59                             uint32_t options,
     60                             int32_t *matchLen1, int32_t *matchLen2,
     61                             UErrorCode *pErrorCode);
     62 
     63 #ifdef __cplusplus
     64 
     65 U_NAMESPACE_BEGIN
     66 
     67 class BreakIterator;        // unicode/brkiter.h
     68 class ByteSink;
     69 class Locale;               // unicode/locid.h
     70 
     71 /** Returns true if the options are valid. Otherwise false, and sets an error. */
     72 inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
     73    if (U_FAILURE(errorCode)) { return false; }
     74    if ((options & U_TITLECASE_ADJUSTMENT_MASK) == U_TITLECASE_ADJUSTMENT_MASK) {
     75        // Both options together.
     76        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     77        return false;
     78    }
     79    return true;
     80 }
     81 
     82 inline UBool ustrcase_isLNS(UChar32 c) {
     83    // Letter, number, symbol,
     84    // or a private use code point because those are typically used as letters or numbers.
     85    // Consider modifier letters only if they are cased.
     86    const uint32_t LNS = (U_GC_L_MASK|U_GC_N_MASK|U_GC_S_MASK|U_GC_CO_MASK) & ~U_GC_LM_MASK;
     87    int gc = u_charType(c);
     88    return (U_MASK(gc) & LNS) != 0 || (gc == U_MODIFIER_LETTER && ucase_getType(c) != UCASE_NONE);
     89 }
     90 
     91 #if !UCONFIG_NO_BREAK_ITERATION
     92 
     93 /** Returns nullptr if error. Pass in either locale or locID, not both. */
     94 U_CFUNC
     95 BreakIterator *ustrcase_getTitleBreakIterator(
     96        const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
     97        LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode);
     98 
     99 #endif
    100 
    101 U_NAMESPACE_END
    102 
    103 #include "unicode/unistr.h"  // for UStringCaseMapper
    104 
    105 /*
    106 * Internal string casing functions implementing
    107 * ustring.h/ustrcase.cpp and UnicodeString case mapping functions.
    108 */
    109 
    110 struct UCaseMap : public icu::UMemory {
    111    /** Implements most of ucasemap_open(). */
    112    UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode);
    113    ~UCaseMap();
    114 
    115 #if !UCONFIG_NO_BREAK_ITERATION
    116    icu::BreakIterator *iter;  /* We adopt the iterator, so we own it. */
    117 #endif
    118    char locale[32];
    119    int32_t caseLocale;
    120    uint32_t options;
    121 };
    122 
    123 #if UCONFIG_NO_BREAK_ITERATION
    124 #   define UCASEMAP_BREAK_ITERATOR_PARAM
    125 #   define UCASEMAP_BREAK_ITERATOR_UNUSED
    126 #   define UCASEMAP_BREAK_ITERATOR
    127 #   define UCASEMAP_BREAK_ITERATOR_NULL
    128 #else
    129 #   define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter,
    130 #   define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *,
    131 #   define UCASEMAP_BREAK_ITERATOR iter,
    132 #   define UCASEMAP_BREAK_ITERATOR_NULL NULL,
    133 #endif
    134 
    135 U_CFUNC int32_t
    136 ustrcase_getCaseLocale(const char *locale);
    137 
    138 // TODO: swap src / dest if approved for new public api
    139 /** Implements UStringCaseMapper. */
    140 U_CFUNC int32_t U_CALLCONV
    141 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    142                         char16_t *dest, int32_t destCapacity,
    143                         const char16_t *src, int32_t srcLength,
    144                         icu::Edits *edits,
    145                         UErrorCode &errorCode);
    146 
    147 /** Implements UStringCaseMapper. */
    148 U_CFUNC int32_t U_CALLCONV
    149 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    150                         char16_t *dest, int32_t destCapacity,
    151                         const char16_t *src, int32_t srcLength,
    152                         icu::Edits *edits,
    153                         UErrorCode &errorCode);
    154 
    155 #if !UCONFIG_NO_BREAK_ITERATION
    156 
    157 /** Implements UStringCaseMapper. */
    158 U_CFUNC int32_t U_CALLCONV
    159 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options,
    160                         icu::BreakIterator *iter,
    161                         char16_t *dest, int32_t destCapacity,
    162                         const char16_t *src, int32_t srcLength,
    163                         icu::Edits *edits,
    164                         UErrorCode &errorCode);
    165 
    166 #endif
    167 
    168 /** Implements UStringCaseMapper. */
    169 U_CFUNC int32_t U_CALLCONV
    170 ustrcase_internalFold(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    171                      char16_t *dest, int32_t destCapacity,
    172                      const char16_t *src, int32_t srcLength,
    173                      icu::Edits *edits,
    174                      UErrorCode &errorCode);
    175 
    176 /**
    177 * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
    178 * Implements argument checking.
    179 */
    180 U_CFUNC int32_t
    181 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    182             char16_t *dest, int32_t destCapacity,
    183             const char16_t *src, int32_t srcLength,
    184             UStringCaseMapper *stringCaseMapper,
    185             icu::Edits *edits,
    186             UErrorCode &errorCode);
    187 
    188 /**
    189 * Common string case mapping implementation for old-fashioned u_strToXyz() functions
    190 * that allow the source string to overlap the destination buffer.
    191 * Implements argument checking and internally works with an intermediate buffer if necessary.
    192 */
    193 U_CFUNC int32_t
    194 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    195                        char16_t *dest, int32_t destCapacity,
    196                        const char16_t *src, int32_t srcLength,
    197                        UStringCaseMapper *stringCaseMapper,
    198                        UErrorCode &errorCode);
    199 
    200 /**
    201 * UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
    202 * UTF-8 version of UStringCaseMapper.
    203 * All error checking must be done.
    204 * The UCaseMap must be fully initialized, with locale and/or iter set as needed.
    205 */
    206 typedef void U_CALLCONV
    207 UTF8CaseMapper(int32_t caseLocale, uint32_t options,
    208 #if !UCONFIG_NO_BREAK_ITERATION
    209               icu::BreakIterator *iter,
    210 #endif
    211               const uint8_t *src, int32_t srcLength,
    212               icu::ByteSink &sink, icu::Edits *edits,
    213               UErrorCode &errorCode);
    214 
    215 #if !UCONFIG_NO_BREAK_ITERATION
    216 
    217 /** Implements UTF8CaseMapper. */
    218 U_CFUNC void U_CALLCONV
    219 ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options,
    220        icu::BreakIterator *iter,
    221        const uint8_t *src, int32_t srcLength,
    222        icu::ByteSink &sink, icu::Edits *edits,
    223        UErrorCode &errorCode);
    224 
    225 #endif
    226 
    227 void
    228 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    229                 const char *src, int32_t srcLength,
    230                 UTF8CaseMapper *stringCaseMapper,
    231                 icu::ByteSink &sink, icu::Edits *edits,
    232                 UErrorCode &errorCode);
    233 
    234 /**
    235 * Implements argument checking and buffer handling
    236 * for UTF-8 string case mapping as a common function.
    237 */
    238 int32_t
    239 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    240                 char *dest, int32_t destCapacity,
    241                 const char *src, int32_t srcLength,
    242                 UTF8CaseMapper *stringCaseMapper,
    243                 icu::Edits *edits,
    244                 UErrorCode &errorCode);
    245 
    246 U_NAMESPACE_BEGIN
    247 namespace GreekUpper {
    248 
    249 // Data bits.
    250 static const uint32_t UPPER_MASK = 0x3ff;
    251 static const uint32_t HAS_VOWEL = 0x1000;
    252 static const uint32_t HAS_YPOGEGRAMMENI = 0x2000;
    253 static const uint32_t HAS_ACCENT = 0x4000;
    254 static const uint32_t HAS_DIALYTIKA = 0x8000;
    255 // Further bits during data building and processing, not stored in the data map.
    256 static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000;
    257 static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000;
    258 
    259 static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
    260 static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
    261        HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
    262 static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
    263 
    264 // State bits.
    265 static const uint32_t AFTER_CASED = 1;
    266 static const uint32_t AFTER_VOWEL_WITH_COMBINING_ACCENT = 2;
    267 static const uint32_t AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT = 4;
    268 
    269 uint32_t getLetterData(UChar32 c);
    270 
    271 /**
    272 * Returns a non-zero value for each of the Greek combining diacritics
    273 * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
    274 * plus some perispomeni look-alikes.
    275 */
    276 uint32_t getDiacriticData(UChar32 c);
    277 
    278 }  // namespace GreekUpper
    279 U_NAMESPACE_END
    280 
    281 #endif  // __cplusplus
    282 
    283 #endif  // __UCASEMAP_IMP_H__