[ tor-browser ].git.dasho

ustrcase.cpp (63950B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2015, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  ustrcase.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2002feb20
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Implementation file for string casing C API functions.
     19 *   Uses functions from uchar.c for basic functionality that requires access
     20 *   to the Unicode Character Database (uprops.dat).
     21 */
     22 
     23 #include "unicode/utypes.h"
     24 #include "unicode/brkiter.h"
     25 #include "unicode/casemap.h"
     26 #include "unicode/edits.h"
     27 #include "unicode/stringoptions.h"
     28 #include "unicode/ustring.h"
     29 #include "unicode/ucasemap.h"
     30 #include "unicode/ubrk.h"
     31 #include "unicode/utf.h"
     32 #include "unicode/utf16.h"
     33 #include "cmemory.h"
     34 #include "ucase.h"
     35 #include "ucasemap_imp.h"
     36 #include "ustr_imp.h"
     37 #include "uassert.h"
     38 
     39 /**
     40 * Code point for COMBINING ACUTE ACCENT
     41 * @internal
     42 */
     43 #define ACUTE u'\u0301'
     44 
     45 U_NAMESPACE_BEGIN
     46 
     47 namespace {
     48 
     49 int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
     50                                   Edits *edits, UErrorCode &errorCode) {
     51    if (U_SUCCESS(errorCode)) {
     52        if (destIndex > destCapacity) {
     53            errorCode = U_BUFFER_OVERFLOW_ERROR;
     54        } else if (edits != nullptr) {
     55            edits->copyErrorTo(errorCode);
     56        }
     57    }
     58    return destIndex;
     59 }
     60 
     61 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
     62 inline int32_t
     63 appendResult(char16_t *dest, int32_t destIndex, int32_t destCapacity,
     64             int32_t result, const char16_t *s,
     65             int32_t cpLength, uint32_t options, icu::Edits *edits) {
     66    UChar32 c;
     67    int32_t length;
     68 
     69    /* decode the result */
     70    if(result<0) {
     71        /* (not) original code point */
     72        if(edits!=nullptr) {
     73            edits->addUnchanged(cpLength);
     74        }
     75        if(options & U_OMIT_UNCHANGED_TEXT) {
     76            return destIndex;
     77        }
     78        c=~result;
     79        if(destIndex<destCapacity && c<=0xffff) {  // BMP slightly-fastpath
     80            dest[destIndex++] = static_cast<char16_t>(c);
     81            return destIndex;
     82        }
     83        length=cpLength;
     84    } else {
     85        if(result<=UCASE_MAX_STRING_LENGTH) {
     86            c=U_SENTINEL;
     87            length=result;
     88        } else if(destIndex<destCapacity && result<=0xffff) {  // BMP slightly-fastpath
     89            dest[destIndex++] = static_cast<char16_t>(result);
     90            if(edits!=nullptr) {
     91                edits->addReplace(cpLength, 1);
     92            }
     93            return destIndex;
     94        } else {
     95            c=result;
     96            length=U16_LENGTH(c);
     97        }
     98        if(edits!=nullptr) {
     99            edits->addReplace(cpLength, length);
    100        }
    101    }
    102    if(length>(INT32_MAX-destIndex)) {
    103        return -1;  // integer overflow
    104    }
    105 
    106    if(destIndex<destCapacity) {
    107        /* append the result */
    108        if(c>=0) {
    109            /* code point */
    110            UBool isError=false;
    111            U16_APPEND(dest, destIndex, destCapacity, c, isError);
    112            if(isError) {
    113                /* overflow, nothing written */
    114                destIndex+=length;
    115            }
    116        } else {
    117            /* string */
    118            if((destIndex+length)<=destCapacity) {
    119                while(length>0) {
    120                    dest[destIndex++]=*s++;
    121                    --length;
    122                }
    123            } else {
    124                /* overflow */
    125                destIndex+=length;
    126            }
    127        }
    128    } else {
    129        /* preflight */
    130        destIndex+=length;
    131    }
    132    return destIndex;
    133 }
    134 
    135 inline int32_t
    136 appendUChar(char16_t *dest, int32_t destIndex, int32_t destCapacity, char16_t c) {
    137    if(destIndex<destCapacity) {
    138        dest[destIndex]=c;
    139    } else if(destIndex==INT32_MAX) {
    140        return -1;  // integer overflow
    141    }
    142    return destIndex+1;
    143 }
    144 
    145 int32_t
    146 appendNonEmptyUnchanged(char16_t *dest, int32_t destIndex, int32_t destCapacity,
    147                        const char16_t *s, int32_t length, uint32_t options, icu::Edits *edits) {
    148    if(edits!=nullptr) {
    149        edits->addUnchanged(length);
    150    }
    151    if(options & U_OMIT_UNCHANGED_TEXT) {
    152        return destIndex;
    153    }
    154    if(length>(INT32_MAX-destIndex)) {
    155        return -1;  // integer overflow
    156    }
    157    if((destIndex+length)<=destCapacity) {
    158        u_memcpy(dest+destIndex, s, length);
    159    }
    160    return destIndex + length;
    161 }
    162 
    163 inline int32_t
    164 appendUnchanged(char16_t *dest, int32_t destIndex, int32_t destCapacity,
    165                const char16_t *s, int32_t length, uint32_t options, icu::Edits *edits) {
    166    if (length <= 0) {
    167        return destIndex;
    168    }
    169    return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits);
    170 }
    171 
    172 UChar32 U_CALLCONV
    173 utf16_caseContextIterator(void *context, int8_t dir) {
    174    UCaseContext* csc = static_cast<UCaseContext*>(context);
    175    UChar32 c;
    176 
    177    if(dir<0) {
    178        /* reset for backward iteration */
    179        csc->index=csc->cpStart;
    180        csc->dir=dir;
    181    } else if(dir>0) {
    182        /* reset for forward iteration */
    183        csc->index=csc->cpLimit;
    184        csc->dir=dir;
    185    } else {
    186        /* continue current iteration direction */
    187        dir=csc->dir;
    188    }
    189 
    190    if(dir<0) {
    191        if(csc->start<csc->index) {
    192            U16_PREV((const char16_t *)csc->p, csc->start, csc->index, c);
    193            return c;
    194        }
    195    } else {
    196        if(csc->index<csc->limit) {
    197            U16_NEXT((const char16_t *)csc->p, csc->index, csc->limit, c);
    198            return c;
    199        }
    200    }
    201    return U_SENTINEL;
    202 }
    203 
    204 /**
    205 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
    206 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
    207 */
    208 int32_t toLower(int32_t caseLocale, uint32_t options,
    209                char16_t *dest, int32_t destCapacity,
    210                const char16_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
    211                icu::Edits *edits, UErrorCode &errorCode) {
    212    const int8_t *latinToLower;
    213    if (caseLocale == UCASE_LOC_ROOT ||
    214            (caseLocale >= 0 ?
    215                !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
    216                (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
    217        latinToLower = LatinCase::TO_LOWER_NORMAL;
    218    } else {
    219        latinToLower = LatinCase::TO_LOWER_TR_LT;
    220    }
    221    const UTrie2 *trie = ucase_getTrie();
    222    int32_t destIndex = 0;
    223    int32_t prev = srcStart;
    224    int32_t srcIndex = srcStart;
    225    for (;;) {
    226        // fast path for simple cases
    227        char16_t lead = 0;
    228        while (srcIndex < srcLimit) {
    229            lead = src[srcIndex];
    230            int32_t delta;
    231            if (lead < LatinCase::LONG_S) {
    232                int8_t d = latinToLower[lead];
    233                if (d == LatinCase::EXC) { break; }
    234                ++srcIndex;
    235                if (d == 0) { continue; }
    236                delta = d;
    237            } else if (lead >= 0xd800) {
    238                break;  // surrogate or higher
    239            } else {
    240                uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
    241                if (UCASE_HAS_EXCEPTION(props)) { break; }
    242                ++srcIndex;
    243                if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
    244                    continue;
    245                }
    246            }
    247            lead += static_cast<char16_t>(delta);
    248            destIndex = appendUnchanged(dest, destIndex, destCapacity,
    249                                        src + prev, srcIndex - 1 - prev, options, edits);
    250            if (destIndex >= 0) {
    251                destIndex = appendUChar(dest, destIndex, destCapacity, lead);
    252                if (edits != nullptr) {
    253                    edits->addReplace(1, 1);
    254                }
    255            }
    256            if (destIndex < 0) {
    257                errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    258                return 0;
    259            }
    260            prev = srcIndex;
    261        }
    262        if (srcIndex >= srcLimit) {
    263            break;
    264        }
    265        // slow path
    266        int32_t cpStart = srcIndex++;
    267        char16_t trail;
    268        UChar32 c;
    269        if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) {
    270            c = U16_GET_SUPPLEMENTARY(lead, trail);
    271            ++srcIndex;
    272        } else {
    273            c = lead;
    274        }
    275        const char16_t *s = nullptr;
    276        if (caseLocale >= 0) {
    277            csc->cpStart = cpStart;
    278            csc->cpLimit = srcIndex;
    279            c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale);
    280        } else {
    281            c = ucase_toFullFolding(c, &s, options);
    282        }
    283        if (c >= 0) {
    284            destIndex = appendUnchanged(dest, destIndex, destCapacity,
    285                                        src + prev, cpStart - prev, options, edits);
    286            if (destIndex >= 0) {
    287                destIndex = appendResult(dest, destIndex, destCapacity, c, s,
    288                                         srcIndex - cpStart, options, edits);
    289            }
    290            if (destIndex < 0) {
    291                errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    292                return 0;
    293            }
    294            prev = srcIndex;
    295        }
    296    }
    297    destIndex = appendUnchanged(dest, destIndex, destCapacity,
    298                                src + prev, srcIndex - prev, options, edits);
    299    if (destIndex < 0) {
    300        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    301        return 0;
    302    }
    303    return destIndex;
    304 }
    305 
    306 int32_t toUpper(int32_t caseLocale, uint32_t options,
    307                char16_t *dest, int32_t destCapacity,
    308                const char16_t *src, UCaseContext *csc, int32_t srcLength,
    309                icu::Edits *edits, UErrorCode &errorCode) {
    310    const int8_t *latinToUpper;
    311    if (caseLocale == UCASE_LOC_TURKISH) {
    312        latinToUpper = LatinCase::TO_UPPER_TR;
    313    } else {
    314        latinToUpper = LatinCase::TO_UPPER_NORMAL;
    315    }
    316    const UTrie2 *trie = ucase_getTrie();
    317    int32_t destIndex = 0;
    318    int32_t prev = 0;
    319    int32_t srcIndex = 0;
    320    for (;;) {
    321        // fast path for simple cases
    322        char16_t lead = 0;
    323        while (srcIndex < srcLength) {
    324            lead = src[srcIndex];
    325            int32_t delta;
    326            if (lead < LatinCase::LONG_S) {
    327                int8_t d = latinToUpper[lead];
    328                if (d == LatinCase::EXC) { break; }
    329                ++srcIndex;
    330                if (d == 0) { continue; }
    331                delta = d;
    332            } else if (lead >= 0xd800) {
    333                break;  // surrogate or higher
    334            } else {
    335                uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
    336                if (UCASE_HAS_EXCEPTION(props)) { break; }
    337                ++srcIndex;
    338                if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
    339                    continue;
    340                }
    341            }
    342            lead += static_cast<char16_t>(delta);
    343            destIndex = appendUnchanged(dest, destIndex, destCapacity,
    344                                        src + prev, srcIndex - 1 - prev, options, edits);
    345            if (destIndex >= 0) {
    346                destIndex = appendUChar(dest, destIndex, destCapacity, lead);
    347                if (edits != nullptr) {
    348                    edits->addReplace(1, 1);
    349                }
    350            }
    351            if (destIndex < 0) {
    352                errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    353                return 0;
    354            }
    355            prev = srcIndex;
    356        }
    357        if (srcIndex >= srcLength) {
    358            break;
    359        }
    360        // slow path
    361        int32_t cpStart;
    362        csc->cpStart = cpStart = srcIndex++;
    363        char16_t trail;
    364        UChar32 c;
    365        if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) {
    366            c = U16_GET_SUPPLEMENTARY(lead, trail);
    367            ++srcIndex;
    368        } else {
    369            c = lead;
    370        }
    371        csc->cpLimit = srcIndex;
    372        const char16_t *s = nullptr;
    373        c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale);
    374        if (c >= 0) {
    375            destIndex = appendUnchanged(dest, destIndex, destCapacity,
    376                                        src + prev, cpStart - prev, options, edits);
    377            if (destIndex >= 0) {
    378                destIndex = appendResult(dest, destIndex, destCapacity, c, s,
    379                                         srcIndex - cpStart, options, edits);
    380            }
    381            if (destIndex < 0) {
    382                errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    383                return 0;
    384            }
    385            prev = srcIndex;
    386        }
    387    }
    388    destIndex = appendUnchanged(dest, destIndex, destCapacity,
    389                                src + prev, srcIndex - prev, options, edits);
    390    if (destIndex < 0) {
    391        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    392        return 0;
    393    }
    394    return destIndex;
    395 }
    396 
    397 }  // namespace
    398 
    399 U_NAMESPACE_END
    400 
    401 U_NAMESPACE_USE
    402 
    403 #if !UCONFIG_NO_BREAK_ITERATION
    404 
    405 namespace {
    406 
    407 /**
    408 * Input: c is a letter I with or without acute accent.
    409 * start is the index in src after c, and is less than segmentLimit.
    410 * If a plain i/I is followed by a plain j/J,
    411 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
    412 * then we output accordingly.
    413 *
    414 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
    415 */
    416 int32_t maybeTitleDutchIJ(const char16_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
    417                          char16_t *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
    418                          icu::Edits *edits) {
    419    U_ASSERT(start < segmentLimit);
    420 
    421    int32_t index = start;
    422    bool withAcute = false;
    423 
    424    // If the conditions are met, then the following variables tell us what to output.
    425    int32_t unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
    426    bool doTitleJ = false;  // true if the j needs to be titlecased
    427    int32_t unchanged2 = 0;  // after the j (0 or 1)
    428 
    429    // next character after the first letter
    430    char16_t c2 = src[index++];
    431 
    432    // Is the first letter an i/I with accent?
    433    if (c == u'I') {
    434        if (c2 == ACUTE) {
    435            withAcute = true;
    436            unchanged1 = 1;
    437            if (index == segmentLimit) { return start; }
    438            c2 = src[index++];
    439        }
    440    } else {  // Í
    441        withAcute = true;
    442    }
    443 
    444    // Is the next character a j/J?
    445    if (c2 == u'j') {
    446        doTitleJ = true;
    447    } else if (c2 == u'J') {
    448        ++unchanged1;
    449    } else {
    450        return start;
    451    }
    452 
    453    // A plain i/I must be followed by a plain j/J.
    454    // An i/I with acute must be followed by a j/J with acute.
    455    if (withAcute) {
    456        if (index == segmentLimit || src[index++] != ACUTE) { return start; }
    457        if (doTitleJ) {
    458            unchanged2 = 1;
    459        } else {
    460            ++unchanged1;
    461        }
    462    }
    463 
    464    // There must not be another combining mark.
    465    if (index < segmentLimit) {
    466        int32_t cp;
    467        int32_t i = index;
    468        U16_NEXT(src, i, segmentLimit, cp);
    469        uint32_t typeMask = U_GET_GC_MASK(cp);
    470        if ((typeMask & U_GC_M_MASK) != 0) {
    471            return start;
    472        }
    473    }
    474 
    475    // Output the rest of the Dutch IJ.
    476    destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits);
    477    start += unchanged1;
    478    if (doTitleJ) {
    479        destIndex = appendUChar(dest, destIndex, destCapacity, u'J');
    480        if (edits != nullptr) {
    481            edits->addReplace(1, 1);
    482        }
    483        ++start;
    484    }
    485    destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits);
    486 
    487    U_ASSERT(start + unchanged2 == index);
    488    return index;
    489 }
    490 
    491 }  // namespace
    492 
    493 U_CFUNC int32_t U_CALLCONV
    494 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
    495                         char16_t *dest, int32_t destCapacity,
    496                         const char16_t *src, int32_t srcLength,
    497                         icu::Edits *edits,
    498                         UErrorCode &errorCode) {
    499    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
    500        return 0;
    501    }
    502 
    503    /* set up local variables */
    504    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    505    csc.p=(void *)src;
    506    csc.limit=srcLength;
    507    int32_t destIndex=0;
    508    int32_t prev=0;
    509    bool isFirstIndex=true;
    510 
    511    /* titlecasing loop */
    512    while(prev<srcLength) {
    513        /* find next index where to titlecase */
    514        int32_t index;
    515        if(isFirstIndex) {
    516            isFirstIndex=false;
    517            index=iter->first();
    518        } else {
    519            index=iter->next();
    520        }
    521        if(index==UBRK_DONE || index>srcLength) {
    522            index=srcLength;
    523        }
    524 
    525        /*
    526         * Segment [prev..index[ into 3 parts:
    527         * a) skipped characters (copy as-is) [prev..titleStart[
    528         * b) first letter (titlecase)              [titleStart..titleLimit[
    529         * c) subsequent characters (lowercase)                 [titleLimit..index[
    530         */
    531        if(prev<index) {
    532            // Find and copy skipped characters [prev..titleStart[
    533            int32_t titleStart=prev;
    534            int32_t titleLimit=prev;
    535            UChar32 c;
    536            U16_NEXT(src, titleLimit, index, c);
    537            if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
    538                // Adjust the titlecasing index to the next cased character,
    539                // or to the next letter/number/symbol/private use.
    540                // Stop with titleStart<titleLimit<=index
    541                // if there is a character to be titlecased,
    542                // or else stop with titleStart==titleLimit==index.
    543                bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
    544                while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
    545                    titleStart=titleLimit;
    546                    if(titleLimit==index) {
    547                        break;
    548                    }
    549                    U16_NEXT(src, titleLimit, index, c);
    550                }
    551                if (prev < titleStart) {
    552                    destIndex=appendUnchanged(dest, destIndex, destCapacity,
    553                                              src+prev, titleStart-prev, options, edits);
    554                    if(destIndex<0) {
    555                        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    556                        return 0;
    557                    }
    558                }
    559            }
    560 
    561            if(titleStart<titleLimit) {
    562                /* titlecase c which is from [titleStart..titleLimit[ */
    563                csc.cpStart=titleStart;
    564                csc.cpLimit=titleLimit;
    565                const char16_t *s;
    566                c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
    567                destIndex=appendResult(dest, destIndex, destCapacity, c, s,
    568                                       titleLimit-titleStart, options, edits);
    569                if(destIndex<0) {
    570                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    571                    return 0;
    572                }
    573 
    574                /* Special case Dutch IJ titlecasing */
    575                if (titleStart+1 < index &&
    576                        caseLocale == UCASE_LOC_DUTCH) {
    577                    if (c < 0) {
    578                        c = ~c;
    579                    }
    580 
    581                    if (c == u'I' || c == u'Í') {
    582                        titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index,
    583                                                       dest, destIndex, destCapacity, options,
    584                                                       edits);
    585                    }
    586                }
    587 
    588                /* lowercase [titleLimit..index[ */
    589                if(titleLimit<index) {
    590                    if((options&U_TITLECASE_NO_LOWERCASE)==0) {
    591                        /* Normal operation: Lowercase the rest of the word. */
    592                        destIndex+=
    593                            toLower(
    594                                caseLocale, options,
    595                                (dest==nullptr) ? nullptr: dest+destIndex, destCapacity-destIndex,
    596                                src, &csc, titleLimit, index,
    597                                edits, errorCode);
    598                        if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    599                            errorCode=U_ZERO_ERROR;
    600                        }
    601                        if(U_FAILURE(errorCode)) {
    602                            return destIndex;
    603                        }
    604                    } else {
    605                        /* Optionally just copy the rest of the word unchanged. */
    606                        destIndex=appendUnchanged(dest, destIndex, destCapacity,
    607                                                  src+titleLimit, index-titleLimit, options, edits);
    608                        if(destIndex<0) {
    609                            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    610                            return 0;
    611                        }
    612                    }
    613                }
    614            }
    615        }
    616 
    617        prev=index;
    618    }
    619 
    620    return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
    621 }
    622 
    623 #endif  // !UCONFIG_NO_BREAK_ITERATION
    624 
    625 U_NAMESPACE_BEGIN
    626 namespace GreekUpper {
    627 
    628 // Data generated by prototype code, see
    629 // https://icu.unicode.org/design/case/greek-upper
    630 // TODO: Move this data into ucase.icu.
    631 static const uint16_t data0370[] = {
    632    // U+0370..03FF
    633    0x0370,
    634    0x0370,
    635    0x0372,
    636    0x0372,
    637    0,
    638    0,
    639    0x0376,
    640    0x0376,
    641    0,
    642    0,
    643    0x037A,
    644    0x03FD,
    645    0x03FE,
    646    0x03FF,
    647    0,
    648    0x037F,
    649    0,
    650    0,
    651    0,
    652    0,
    653    0,
    654    0,
    655    0x0391 | HAS_VOWEL | HAS_ACCENT,
    656    0,
    657    0x0395 | HAS_VOWEL | HAS_ACCENT,
    658    0x0397 | HAS_VOWEL | HAS_ACCENT,
    659    0x0399 | HAS_VOWEL | HAS_ACCENT,
    660    0,
    661    0x039F | HAS_VOWEL | HAS_ACCENT,
    662    0,
    663    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    664    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    665    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    666    0x0391 | HAS_VOWEL,
    667    0x0392,
    668    0x0393,
    669    0x0394,
    670    0x0395 | HAS_VOWEL,
    671    0x0396,
    672    0x0397 | HAS_VOWEL,
    673    0x0398,
    674    0x0399 | HAS_VOWEL,
    675    0x039A,
    676    0x039B,
    677    0x039C,
    678    0x039D,
    679    0x039E,
    680    0x039F | HAS_VOWEL,
    681    0x03A0,
    682    0x03A1,
    683    0,
    684    0x03A3,
    685    0x03A4,
    686    0x03A5 | HAS_VOWEL,
    687    0x03A6,
    688    0x03A7,
    689    0x03A8,
    690    0x03A9 | HAS_VOWEL,
    691    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    692    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    693    0x0391 | HAS_VOWEL | HAS_ACCENT,
    694    0x0395 | HAS_VOWEL | HAS_ACCENT,
    695    0x0397 | HAS_VOWEL | HAS_ACCENT,
    696    0x0399 | HAS_VOWEL | HAS_ACCENT,
    697    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    698    0x0391 | HAS_VOWEL,
    699    0x0392,
    700    0x0393,
    701    0x0394,
    702    0x0395 | HAS_VOWEL,
    703    0x0396,
    704    0x0397 | HAS_VOWEL,
    705    0x0398,
    706    0x0399 | HAS_VOWEL,
    707    0x039A,
    708    0x039B,
    709    0x039C,
    710    0x039D,
    711    0x039E,
    712    0x039F | HAS_VOWEL,
    713    0x03A0,
    714    0x03A1,
    715    0x03A3,
    716    0x03A3,
    717    0x03A4,
    718    0x03A5 | HAS_VOWEL,
    719    0x03A6,
    720    0x03A7,
    721    0x03A8,
    722    0x03A9 | HAS_VOWEL,
    723    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    724    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    725    0x039F | HAS_VOWEL | HAS_ACCENT,
    726    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    727    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    728    0x03CF,
    729    0x0392,
    730    0x0398,
    731    0x03D2,
    732    0x03D2 | HAS_ACCENT,
    733    0x03D2 | HAS_DIALYTIKA,
    734    0x03A6,
    735    0x03A0,
    736    0x03CF,
    737    0x03D8,
    738    0x03D8,
    739    0x03DA,
    740    0x03DA,
    741    0x03DC,
    742    0x03DC,
    743    0x03DE,
    744    0x03DE,
    745    0x03E0,
    746    0x03E0,
    747    0,
    748    0,
    749    0,
    750    0,
    751    0,
    752    0,
    753    0,
    754    0,
    755    0,
    756    0,
    757    0,
    758    0,
    759    0,
    760    0,
    761    0x039A,
    762    0x03A1,
    763    0x03F9,
    764    0x037F,
    765    0x03F4,
    766    0x0395 | HAS_VOWEL,
    767    0,
    768    0x03F7,
    769    0x03F7,
    770    0x03F9,
    771    0x03FA,
    772    0x03FA,
    773    0x03FC,
    774    0x03FD,
    775    0x03FE,
    776    0x03FF,
    777 };
    778 
    779 static const uint16_t data1F00[] = {
    780    // U+1F00..1FFF
    781    0x0391 | HAS_VOWEL,
    782    0x0391 | HAS_VOWEL,
    783    0x0391 | HAS_VOWEL | HAS_ACCENT,
    784    0x0391 | HAS_VOWEL | HAS_ACCENT,
    785    0x0391 | HAS_VOWEL | HAS_ACCENT,
    786    0x0391 | HAS_VOWEL | HAS_ACCENT,
    787    0x0391 | HAS_VOWEL | HAS_ACCENT,
    788    0x0391 | HAS_VOWEL | HAS_ACCENT,
    789    0x0391 | HAS_VOWEL,
    790    0x0391 | HAS_VOWEL,
    791    0x0391 | HAS_VOWEL | HAS_ACCENT,
    792    0x0391 | HAS_VOWEL | HAS_ACCENT,
    793    0x0391 | HAS_VOWEL | HAS_ACCENT,
    794    0x0391 | HAS_VOWEL | HAS_ACCENT,
    795    0x0391 | HAS_VOWEL | HAS_ACCENT,
    796    0x0391 | HAS_VOWEL | HAS_ACCENT,
    797    0x0395 | HAS_VOWEL,
    798    0x0395 | HAS_VOWEL,
    799    0x0395 | HAS_VOWEL | HAS_ACCENT,
    800    0x0395 | HAS_VOWEL | HAS_ACCENT,
    801    0x0395 | HAS_VOWEL | HAS_ACCENT,
    802    0x0395 | HAS_VOWEL | HAS_ACCENT,
    803    0,
    804    0,
    805    0x0395 | HAS_VOWEL,
    806    0x0395 | HAS_VOWEL,
    807    0x0395 | HAS_VOWEL | HAS_ACCENT,
    808    0x0395 | HAS_VOWEL | HAS_ACCENT,
    809    0x0395 | HAS_VOWEL | HAS_ACCENT,
    810    0x0395 | HAS_VOWEL | HAS_ACCENT,
    811    0,
    812    0,
    813    0x0397 | HAS_VOWEL,
    814    0x0397 | HAS_VOWEL,
    815    0x0397 | HAS_VOWEL | HAS_ACCENT,
    816    0x0397 | HAS_VOWEL | HAS_ACCENT,
    817    0x0397 | HAS_VOWEL | HAS_ACCENT,
    818    0x0397 | HAS_VOWEL | HAS_ACCENT,
    819    0x0397 | HAS_VOWEL | HAS_ACCENT,
    820    0x0397 | HAS_VOWEL | HAS_ACCENT,
    821    0x0397 | HAS_VOWEL,
    822    0x0397 | HAS_VOWEL,
    823    0x0397 | HAS_VOWEL | HAS_ACCENT,
    824    0x0397 | HAS_VOWEL | HAS_ACCENT,
    825    0x0397 | HAS_VOWEL | HAS_ACCENT,
    826    0x0397 | HAS_VOWEL | HAS_ACCENT,
    827    0x0397 | HAS_VOWEL | HAS_ACCENT,
    828    0x0397 | HAS_VOWEL | HAS_ACCENT,
    829    0x0399 | HAS_VOWEL,
    830    0x0399 | HAS_VOWEL,
    831    0x0399 | HAS_VOWEL | HAS_ACCENT,
    832    0x0399 | HAS_VOWEL | HAS_ACCENT,
    833    0x0399 | HAS_VOWEL | HAS_ACCENT,
    834    0x0399 | HAS_VOWEL | HAS_ACCENT,
    835    0x0399 | HAS_VOWEL | HAS_ACCENT,
    836    0x0399 | HAS_VOWEL | HAS_ACCENT,
    837    0x0399 | HAS_VOWEL,
    838    0x0399 | HAS_VOWEL,
    839    0x0399 | HAS_VOWEL | HAS_ACCENT,
    840    0x0399 | HAS_VOWEL | HAS_ACCENT,
    841    0x0399 | HAS_VOWEL | HAS_ACCENT,
    842    0x0399 | HAS_VOWEL | HAS_ACCENT,
    843    0x0399 | HAS_VOWEL | HAS_ACCENT,
    844    0x0399 | HAS_VOWEL | HAS_ACCENT,
    845    0x039F | HAS_VOWEL,
    846    0x039F | HAS_VOWEL,
    847    0x039F | HAS_VOWEL | HAS_ACCENT,
    848    0x039F | HAS_VOWEL | HAS_ACCENT,
    849    0x039F | HAS_VOWEL | HAS_ACCENT,
    850    0x039F | HAS_VOWEL | HAS_ACCENT,
    851    0,
    852    0,
    853    0x039F | HAS_VOWEL,
    854    0x039F | HAS_VOWEL,
    855    0x039F | HAS_VOWEL | HAS_ACCENT,
    856    0x039F | HAS_VOWEL | HAS_ACCENT,
    857    0x039F | HAS_VOWEL | HAS_ACCENT,
    858    0x039F | HAS_VOWEL | HAS_ACCENT,
    859    0,
    860    0,
    861    0x03A5 | HAS_VOWEL,
    862    0x03A5 | HAS_VOWEL,
    863    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    864    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    865    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    866    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    867    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    868    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    869    0,
    870    0x03A5 | HAS_VOWEL,
    871    0,
    872    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    873    0,
    874    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    875    0,
    876    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    877    0x03A9 | HAS_VOWEL,
    878    0x03A9 | HAS_VOWEL,
    879    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    880    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    881    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    882    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    883    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    884    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    885    0x03A9 | HAS_VOWEL,
    886    0x03A9 | HAS_VOWEL,
    887    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    888    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    889    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    890    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    891    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    892    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    893    0x0391 | HAS_VOWEL | HAS_ACCENT,
    894    0x0391 | HAS_VOWEL | HAS_ACCENT,
    895    0x0395 | HAS_VOWEL | HAS_ACCENT,
    896    0x0395 | HAS_VOWEL | HAS_ACCENT,
    897    0x0397 | HAS_VOWEL | HAS_ACCENT,
    898    0x0397 | HAS_VOWEL | HAS_ACCENT,
    899    0x0399 | HAS_VOWEL | HAS_ACCENT,
    900    0x0399 | HAS_VOWEL | HAS_ACCENT,
    901    0x039F | HAS_VOWEL | HAS_ACCENT,
    902    0x039F | HAS_VOWEL | HAS_ACCENT,
    903    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    904    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    905    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    906    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    907    0,
    908    0,
    909    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    910    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    911    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    912    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    913    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    914    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    915    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    916    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    917    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    918    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    919    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    920    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    921    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    922    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    923    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    924    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    925    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    926    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    927    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    928    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    929    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    930    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    931    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    932    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    933    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    934    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    935    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    936    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    937    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    938    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    939    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    940    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    941    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    942    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    943    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    944    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    945    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    946    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    947    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    948    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    949    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    950    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    951    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    952    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    953    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    954    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    955    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    956    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    957    0x0391 | HAS_VOWEL,
    958    0x0391 | HAS_VOWEL,
    959    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    960    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    961    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    962    0,
    963    0x0391 | HAS_VOWEL | HAS_ACCENT,
    964    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    965    0x0391 | HAS_VOWEL,
    966    0x0391 | HAS_VOWEL,
    967    0x0391 | HAS_VOWEL | HAS_ACCENT,
    968    0x0391 | HAS_VOWEL | HAS_ACCENT,
    969    0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    970    0,
    971    0x0399 | HAS_VOWEL,
    972    0,
    973    0,
    974    0,
    975    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    976    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    977    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    978    0,
    979    0x0397 | HAS_VOWEL | HAS_ACCENT,
    980    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    981    0x0395 | HAS_VOWEL | HAS_ACCENT,
    982    0x0395 | HAS_VOWEL | HAS_ACCENT,
    983    0x0397 | HAS_VOWEL | HAS_ACCENT,
    984    0x0397 | HAS_VOWEL | HAS_ACCENT,
    985    0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    986    0,
    987    0,
    988    0,
    989    0x0399 | HAS_VOWEL,
    990    0x0399 | HAS_VOWEL,
    991    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    992    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    993    0,
    994    0,
    995    0x0399 | HAS_VOWEL | HAS_ACCENT,
    996    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    997    0x0399 | HAS_VOWEL,
    998    0x0399 | HAS_VOWEL,
    999    0x0399 | HAS_VOWEL | HAS_ACCENT,
   1000    0x0399 | HAS_VOWEL | HAS_ACCENT,
   1001    0,
   1002    0,
   1003    0,
   1004    0,
   1005    0x03A5 | HAS_VOWEL,
   1006    0x03A5 | HAS_VOWEL,
   1007    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
   1008    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
   1009    0x03A1,
   1010    0x03A1,
   1011    0x03A5 | HAS_VOWEL | HAS_ACCENT,
   1012    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
   1013    0x03A5 | HAS_VOWEL,
   1014    0x03A5 | HAS_VOWEL,
   1015    0x03A5 | HAS_VOWEL | HAS_ACCENT,
   1016    0x03A5 | HAS_VOWEL | HAS_ACCENT,
   1017    0x03A1,
   1018    0,
   1019    0,
   1020    0,
   1021    0,
   1022    0,
   1023    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
   1024    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
   1025    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
   1026    0,
   1027    0x03A9 | HAS_VOWEL | HAS_ACCENT,
   1028    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
   1029    0x039F | HAS_VOWEL | HAS_ACCENT,
   1030    0x039F | HAS_VOWEL | HAS_ACCENT,
   1031    0x03A9 | HAS_VOWEL | HAS_ACCENT,
   1032    0x03A9 | HAS_VOWEL | HAS_ACCENT,
   1033    0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
   1034    0,
   1035    0,
   1036    0,
   1037 };
   1038 
   1039 // U+2126 Ohm sign
   1040 static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
   1041 
   1042 uint32_t getLetterData(UChar32 c) {
   1043    if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
   1044        return 0;
   1045    } else if (c <= 0x3ff) {
   1046        return data0370[c - 0x370];
   1047    } else if (c <= 0x1fff) {
   1048        return data1F00[c - 0x1f00];
   1049    } else if (c == 0x2126) {
   1050        return data2126;
   1051    } else {
   1052        return 0;
   1053    }
   1054 }
   1055 
   1056 uint32_t getDiacriticData(UChar32 c) {
   1057    switch (c) {
   1058    case 0x0300:  // varia
   1059    case 0x0301:  // tonos = oxia
   1060    case 0x0342:  // perispomeni
   1061    case 0x0302:  // circumflex can look like perispomeni
   1062    case 0x0303:  // tilde can look like perispomeni
   1063    case 0x0311:  // inverted breve can look like perispomeni
   1064        return HAS_ACCENT;
   1065    case 0x0308:  // dialytika = diaeresis
   1066        return HAS_COMBINING_DIALYTIKA;
   1067    case 0x0344:  // dialytika tonos
   1068        return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
   1069    case 0x0345:  // ypogegrammeni = iota subscript
   1070        return HAS_YPOGEGRAMMENI;
   1071    case 0x0304:  // macron
   1072    case 0x0306:  // breve
   1073    case 0x0313:  // comma above
   1074    case 0x0314:  // reversed comma above
   1075    case 0x0343:  // koronis
   1076        return HAS_OTHER_GREEK_DIACRITIC;
   1077    default:
   1078        return 0;
   1079    }
   1080 }
   1081 
   1082 UBool isFollowedByCasedLetter(const char16_t *s, int32_t i, int32_t length) {
   1083    while (i < length) {
   1084        UChar32 c;
   1085        U16_NEXT(s, i, length, c);
   1086        int32_t type = ucase_getTypeOrIgnorable(c);
   1087        if ((type & UCASE_IGNORABLE) != 0) {
   1088            // Case-ignorable, continue with the loop.
   1089        } else if (type != UCASE_NONE) {
   1090            return true;  // Followed by cased letter.
   1091        } else {
   1092            return false;  // Uncased and not case-ignorable.
   1093        }
   1094    }
   1095    return false;  // Not followed by cased letter.
   1096 }
   1097 
   1098 /**
   1099 * Greek string uppercasing with a state machine.
   1100 * Probably simpler than a stateless function that has to figure out complex context-before
   1101 * for each character.
   1102 * TODO: Try to re-consolidate one way or another with the non-Greek function.
   1103 */
   1104 int32_t toUpper(uint32_t options,
   1105                char16_t *dest, int32_t destCapacity,
   1106                const char16_t *src, int32_t srcLength,
   1107                Edits *edits,
   1108                UErrorCode &errorCode) {
   1109    int32_t destIndex=0;
   1110    uint32_t state = 0;
   1111    for (int32_t i = 0; i < srcLength;) {
   1112        int32_t nextIndex = i;
   1113        UChar32 c;
   1114        U16_NEXT(src, nextIndex, srcLength, c);
   1115        uint32_t nextState = 0;
   1116        int32_t type = ucase_getTypeOrIgnorable(c);
   1117        if ((type & UCASE_IGNORABLE) != 0) {
   1118            // c is case-ignorable
   1119            nextState |= (state & AFTER_CASED);
   1120        } else if (type != UCASE_NONE) {
   1121            // c is cased
   1122            nextState |= AFTER_CASED;
   1123        }
   1124        uint32_t data = getLetterData(c);
   1125        if (data > 0) {
   1126            uint32_t upper = data & UPPER_MASK;
   1127            // Add a dialytika to this iota or ypsilon vowel
   1128            // if we removed a tonos from the previous vowel,
   1129            // and that previous vowel did not also have (or gain) a dialytika.
   1130            // Adding one only to the final vowel in a longer sequence
   1131            // (which does not occur in normal writing) would require lookahead.
   1132            // Set the same flag as for preserving an existing dialytika.
   1133            if ((data & HAS_VOWEL) != 0 &&
   1134                (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
   1135                    0 &&
   1136                (upper == 0x399 || upper == 0x3A5)) {
   1137                data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) ? HAS_DIALYTIKA
   1138                                                                      : HAS_COMBINING_DIALYTIKA;
   1139            }
   1140            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
   1141            if ((data & HAS_YPOGEGRAMMENI) != 0) {
   1142                numYpogegrammeni = 1;
   1143            }
   1144            const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
   1145            // Skip combining diacritics after this Greek letter.
   1146            while (nextIndex < srcLength) {
   1147                uint32_t diacriticData = getDiacriticData(src[nextIndex]);
   1148                if (diacriticData != 0) {
   1149                    data |= diacriticData;
   1150                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
   1151                        ++numYpogegrammeni;
   1152                    }
   1153                    ++nextIndex;
   1154                } else {
   1155                    break;  // not a Greek diacritic
   1156                }
   1157            }
   1158            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
   1159                nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
   1160                                                  : AFTER_VOWEL_WITH_COMBINING_ACCENT;
   1161            }
   1162            // Map according to Greek rules.
   1163            UBool addTonos = false;
   1164            if (upper == 0x397 &&
   1165                    (data & HAS_ACCENT) != 0 &&
   1166                    numYpogegrammeni == 0 &&
   1167                    (state & AFTER_CASED) == 0 &&
   1168                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
   1169                // Keep disjunctive "or" with (only) a tonos.
   1170                // We use the same "word boundary" conditions as for the Final_Sigma test.
   1171                if (hasPrecomposedAccent) {
   1172                    upper = 0x389;  // Preserve the precomposed form.
   1173                } else {
   1174                    addTonos = true;
   1175                }
   1176            } else if ((data & HAS_DIALYTIKA) != 0) {
   1177                // Preserve a vowel with dialytika in precomposed form if it exists.
   1178                if (upper == 0x399) {
   1179                    upper = 0x3AA;
   1180                    data &= ~HAS_EITHER_DIALYTIKA;
   1181                } else if (upper == 0x3A5) {
   1182                    upper = 0x3AB;
   1183                    data &= ~HAS_EITHER_DIALYTIKA;
   1184                }
   1185            }
   1186 
   1187            UBool change;
   1188            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
   1189                change = true;  // common, simple usage
   1190            } else {
   1191                // Find out first whether we are changing the text.
   1192                change = src[i] != upper || numYpogegrammeni > 0;
   1193                int32_t i2 = i + 1;
   1194                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
   1195                    change |= i2 >= nextIndex || src[i2] != 0x308;
   1196                    ++i2;
   1197                }
   1198                if (addTonos) {
   1199                    change |= i2 >= nextIndex || src[i2] != 0x301;
   1200                    ++i2;
   1201                }
   1202                int32_t oldLength = nextIndex - i;
   1203                int32_t newLength = (i2 - i) + numYpogegrammeni;
   1204                change |= oldLength != newLength;
   1205                if (change) {
   1206                    if (edits != nullptr) {
   1207                        edits->addReplace(oldLength, newLength);
   1208                    }
   1209                } else {
   1210                    if (edits != nullptr) {
   1211                        edits->addUnchanged(oldLength);
   1212                    }
   1213                    // Write unchanged text?
   1214                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
   1215                }
   1216            }
   1217 
   1218            if (change) {
   1219                destIndex = appendUChar(dest, destIndex, destCapacity, static_cast<char16_t>(upper));
   1220                if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
   1221                    destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
   1222                }
   1223                if (destIndex >= 0 && addTonos) {
   1224                    destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
   1225                }
   1226                while (destIndex >= 0 && numYpogegrammeni > 0) {
   1227                    destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
   1228                    --numYpogegrammeni;
   1229                }
   1230                if(destIndex<0) {
   1231                    errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1232                    return 0;
   1233                }
   1234            }
   1235        } else {
   1236            const char16_t *s;
   1237            c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
   1238            destIndex = appendResult(dest, destIndex, destCapacity, c, s,
   1239                                     nextIndex - i, options, edits);
   1240            if (destIndex < 0) {
   1241                errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
   1242                return 0;
   1243            }
   1244        }
   1245        i = nextIndex;
   1246        state = nextState;
   1247    }
   1248 
   1249    return destIndex;
   1250 }
   1251 
   1252 }  // namespace GreekUpper
   1253 U_NAMESPACE_END
   1254 
   1255 /* functions available in the common library (for unistr_case.cpp) */
   1256 
   1257 U_CFUNC int32_t U_CALLCONV
   1258 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
   1259                         char16_t *dest, int32_t destCapacity,
   1260                         const char16_t *src, int32_t srcLength,
   1261                         icu::Edits *edits,
   1262                         UErrorCode &errorCode) {
   1263    UCaseContext csc=UCASECONTEXT_INITIALIZER;
   1264    csc.p=(void *)src;
   1265    csc.limit=srcLength;
   1266    int32_t destIndex = toLower(
   1267        caseLocale, options,
   1268        dest, destCapacity,
   1269        src, &csc, 0, srcLength,
   1270        edits, errorCode);
   1271    return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
   1272 }
   1273 
   1274 U_CFUNC int32_t U_CALLCONV
   1275 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
   1276                         char16_t *dest, int32_t destCapacity,
   1277                         const char16_t *src, int32_t srcLength,
   1278                         icu::Edits *edits,
   1279                         UErrorCode &errorCode) {
   1280    int32_t destIndex;
   1281    if (caseLocale == UCASE_LOC_GREEK) {
   1282        destIndex = GreekUpper::toUpper(options, dest, destCapacity,
   1283                                        src, srcLength, edits, errorCode);
   1284    } else {
   1285        UCaseContext csc=UCASECONTEXT_INITIALIZER;
   1286        csc.p=(void *)src;
   1287        csc.limit=srcLength;
   1288        destIndex = toUpper(
   1289            caseLocale, options,
   1290            dest, destCapacity,
   1291            src, &csc, srcLength,
   1292            edits, errorCode);
   1293    }
   1294    return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
   1295 }
   1296 
   1297 U_CFUNC int32_t U_CALLCONV
   1298 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
   1299                      char16_t *dest, int32_t destCapacity,
   1300                      const char16_t *src, int32_t srcLength,
   1301                      icu::Edits *edits,
   1302                      UErrorCode &errorCode) {
   1303    int32_t destIndex = toLower(
   1304        -1, options,
   1305        dest, destCapacity,
   1306        src, nullptr, 0, srcLength,
   1307        edits, errorCode);
   1308    return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
   1309 }
   1310 
   1311 U_CFUNC int32_t
   1312 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
   1313             char16_t *dest, int32_t destCapacity,
   1314             const char16_t *src, int32_t srcLength,
   1315             UStringCaseMapper *stringCaseMapper,
   1316             icu::Edits *edits,
   1317             UErrorCode &errorCode) {
   1318    int32_t destLength;
   1319 
   1320    /* check argument values */
   1321    if(U_FAILURE(errorCode)) {
   1322        return 0;
   1323    }
   1324    if( destCapacity<0 ||
   1325        (dest==nullptr && destCapacity>0) ||
   1326        src==nullptr ||
   1327        srcLength<-1
   1328    ) {
   1329        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1330        return 0;
   1331    }
   1332 
   1333    /* get the string length */
   1334    if(srcLength==-1) {
   1335        srcLength=u_strlen(src);
   1336    }
   1337 
   1338    /* check for overlapping source and destination */
   1339    if( dest!=nullptr &&
   1340        ((src>=dest && src<(dest+destCapacity)) ||
   1341         (dest>=src && dest<(src+srcLength)))
   1342    ) {
   1343        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1344        return 0;
   1345    }
   1346 
   1347    if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
   1348        edits->reset();
   1349    }
   1350    destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
   1351                                dest, destCapacity, src, srcLength, edits, errorCode);
   1352    return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
   1353 }
   1354 
   1355 U_CFUNC int32_t
   1356 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
   1357                        char16_t *dest, int32_t destCapacity,
   1358                        const char16_t *src, int32_t srcLength,
   1359                        UStringCaseMapper *stringCaseMapper,
   1360                        UErrorCode &errorCode) {
   1361    char16_t buffer[300];
   1362    char16_t *temp;
   1363 
   1364    int32_t destLength;
   1365 
   1366    /* check argument values */
   1367    if(U_FAILURE(errorCode)) {
   1368        return 0;
   1369    }
   1370    if( destCapacity<0 ||
   1371        (dest==nullptr && destCapacity>0) ||
   1372        src==nullptr ||
   1373        srcLength<-1
   1374    ) {
   1375        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1376        return 0;
   1377    }
   1378 
   1379    /* get the string length */
   1380    if(srcLength==-1) {
   1381        srcLength=u_strlen(src);
   1382    }
   1383 
   1384    /* check for overlapping source and destination */
   1385    if( dest!=nullptr &&
   1386        ((src>=dest && src<(dest+destCapacity)) ||
   1387         (dest>=src && dest<(src+srcLength)))
   1388    ) {
   1389        /* overlap: provide a temporary destination buffer and later copy the result */
   1390        if(destCapacity<=UPRV_LENGTHOF(buffer)) {
   1391            /* the stack buffer is large enough */
   1392            temp=buffer;
   1393        } else {
   1394            /* allocate a buffer */
   1395            temp=(char16_t *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
   1396            if(temp==nullptr) {
   1397                errorCode=U_MEMORY_ALLOCATION_ERROR;
   1398                return 0;
   1399            }
   1400        }
   1401    } else {
   1402        temp=dest;
   1403    }
   1404 
   1405    destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
   1406                                temp, destCapacity, src, srcLength, nullptr, errorCode);
   1407    if(temp!=dest) {
   1408        /* copy the result string to the destination buffer */
   1409        if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
   1410            u_memmove(dest, temp, destLength);
   1411        }
   1412        if(temp!=buffer) {
   1413            uprv_free(temp);
   1414        }
   1415    }
   1416 
   1417    return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
   1418 }
   1419 
   1420 /* public API functions */
   1421 
   1422 U_CAPI int32_t U_EXPORT2
   1423 u_strFoldCase(char16_t *dest, int32_t destCapacity,
   1424              const char16_t *src, int32_t srcLength,
   1425              uint32_t options,
   1426              UErrorCode *pErrorCode) {
   1427    return ustrcase_mapWithOverlap(
   1428        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
   1429        dest, destCapacity,
   1430        src, srcLength,
   1431        ustrcase_internalFold, *pErrorCode);
   1432 }
   1433 
   1434 U_NAMESPACE_BEGIN
   1435 
   1436 int32_t CaseMap::fold(
   1437        uint32_t options,
   1438        const char16_t *src, int32_t srcLength,
   1439        char16_t *dest, int32_t destCapacity, Edits *edits,
   1440        UErrorCode &errorCode) {
   1441    return ustrcase_map(
   1442        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
   1443        dest, destCapacity,
   1444        src, srcLength,
   1445        ustrcase_internalFold, edits, errorCode);
   1446 }
   1447 
   1448 U_NAMESPACE_END
   1449 
   1450 /* case-insensitive string comparisons -------------------------------------- */
   1451 
   1452 /*
   1453 * This function is a copy of unorm_cmpEquivFold() minus the parts for
   1454 * canonical equivalence.
   1455 * Keep the functions in sync, and see there for how this works.
   1456 * The duplication is for modularization:
   1457 * It makes caseless (but not canonical caseless) matches independent of
   1458 * the normalization code.
   1459 */
   1460 
   /* stack element holding the source/decomposition pointers of a previous level */
   struct CmpEquivLevel {
       const char16_t *start;  /* start of the level's text */
       const char16_t *s;      /* current position */
       const char16_t *limit;  /* end of the level's text */
   };
   typedef struct CmpEquivLevel CmpEquivLevel;
   1466 
   1467 /**
   1468 * Internal implementation code comparing string with case fold.
   1469 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
   1470 *
   1471 * @param s1            input string 1
   1472 * @param length1       length of string 1, or -1 (NUL terminated)
   1473 * @param s2            input string 2
   1474 * @param length2       length of string 2, or -1 (NUL terminated)
   1475 * @param options       compare options
   1476 * @param matchLen1     (output) length of partial prefix match in s1
   1477 * @param matchLen2     (output) length of partial prefix match in s2
   1478 * @param pErrorCode    receives error status
   1479 * @return The result of comparison
   1480 */
   1481 static int32_t _cmpFold(
   1482            const char16_t *s1, int32_t length1,
   1483            const char16_t *s2, int32_t length2,
   1484            uint32_t options,
   1485            int32_t *matchLen1, int32_t *matchLen2,
   1486            UErrorCode *pErrorCode) {
   1487    int32_t cmpRes = 0;
   1488 
   1489    /* current-level start/limit - s1/s2 as current */
   1490    const char16_t *start1, *start2, *limit1, *limit2;
   1491 
   1492    /* points to the original start address */
   1493    const char16_t *org1, *org2;
   1494 
   1495    /* points to the end of match + 1 */
   1496    const char16_t *m1, *m2;
   1497 
   1498    /* case folding variables */
   1499    const char16_t *p;
   1500    int32_t length;
   1501 
   1502    /* stacks of previous-level start/current/limit */
   1503    CmpEquivLevel stack1[2], stack2[2];
   1504 
   1505    /* case folding buffers, only use current-level start/limit */
   1506    char16_t fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
   1507 
   1508    /* track which is the current level per string */
   1509    int32_t level1, level2;
   1510 
   1511    /* current code units, and code points for lookups */
   1512    UChar32 c1, c2, cp1, cp2;
   1513 
   1514    /* no argument error checking because this itself is not an API */
   1515 
   1516    /*
   1517     * assume that at least the option U_COMPARE_IGNORE_CASE is set
   1518     * otherwise this function would have to behave exactly as uprv_strCompare()
   1519     */
   1520    if(U_FAILURE(*pErrorCode)) {
   1521        return 0;
   1522    }
   1523 
   1524    /* initialize */
   1525    if(matchLen1) {
   1526        U_ASSERT(matchLen2 !=nullptr);
   1527        *matchLen1=0;
   1528        *matchLen2=0;
   1529    }
   1530 
   1531    start1=m1=org1=s1;
   1532    if(length1==-1) {
   1533        limit1=nullptr;
   1534    } else {
   1535        limit1=s1+length1;
   1536    }
   1537 
   1538    start2=m2=org2=s2;
   1539    if(length2==-1) {
   1540        limit2=nullptr;
   1541    } else {
   1542        limit2=s2+length2;
   1543    }
   1544 
   1545    level1=level2=0;
   1546    c1=c2=-1;
   1547 
   1548    /* comparison loop */
   1549    for(;;) {
   1550        /*
   1551         * here a code unit value of -1 means "get another code unit"
   1552         * below it will mean "this source is finished"
   1553         */
   1554 
   1555        if(c1<0) {
   1556            /* get next code unit from string 1, post-increment */
   1557            for(;;) {
   1558                if(s1==limit1 || ((c1=*s1)==0 && (limit1==nullptr || (options&_STRNCMP_STYLE)))) {
   1559                    if(level1==0) {
   1560                        c1=-1;
   1561                        break;
   1562                    }
   1563                } else {
   1564                    ++s1;
   1565                    break;
   1566                }
   1567 
   1568                /* reached end of level buffer, pop one level */
   1569                do {
   1570                    --level1;
   1571                    start1=stack1[level1].start;    /*Not uninitialized*/
   1572                } while(start1==nullptr);
   1573                s1=stack1[level1].s;                /*Not uninitialized*/
   1574                limit1=stack1[level1].limit;        /*Not uninitialized*/
   1575            }
   1576        }
   1577 
   1578        if(c2<0) {
   1579            /* get next code unit from string 2, post-increment */
   1580            for(;;) {
   1581                if(s2==limit2 || ((c2=*s2)==0 && (limit2==nullptr || (options&_STRNCMP_STYLE)))) {
   1582                    if(level2==0) {
   1583                        c2=-1;
   1584                        break;
   1585                    }
   1586                } else {
   1587                    ++s2;
   1588                    break;
   1589                }
   1590 
   1591                /* reached end of level buffer, pop one level */
   1592                do {
   1593                    --level2;
   1594                    start2=stack2[level2].start;    /*Not uninitialized*/
   1595                } while(start2==nullptr);
   1596                s2=stack2[level2].s;                /*Not uninitialized*/
   1597                limit2=stack2[level2].limit;        /*Not uninitialized*/
   1598            }
   1599        }
   1600 
   1601        /*
   1602         * compare c1 and c2
   1603         * either variable c1, c2 is -1 only if the corresponding string is finished
   1604         */
   1605        if(c1==c2) {
   1606            const char16_t *next1, *next2;
   1607 
   1608            if(c1<0) {
   1609                cmpRes=0;   /* c1==c2==-1 indicating end of strings */
   1610                break;
   1611            }
   1612 
   1613            /*
   1614             * Note: Move the match positions in both strings at the same time
   1615             *      only when corresponding code point(s) in the original strings
   1616             *      are fully consumed. For example, when comparing s1="Fust" and
   1617             *      s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
   1618             *      the first code point in the case-folded data. But the second "s"
   1619             *      has no matching code point in s1, so this implementation returns
   1620             *      2 as the prefix match length ("Fu").
   1621             */
   1622            next1=next2=nullptr;
   1623            if(level1==0) {
   1624                next1=s1;
   1625            } else if(s1==limit1) {
   1626                /* Note: This implementation only use a single level of stack.
   1627                 *      If this code needs to be changed to use multiple levels
   1628                 *      of stacks, the code above should check if the current
   1629                 *      code is at the end of all stacks.
   1630                 */
   1631                U_ASSERT(level1==1);
   1632 
   1633                /* is s1 at the end of the current stack? */
   1634                next1=stack1[0].s;
   1635            }
   1636 
   1637            if (next1!=nullptr) {
   1638                if(level2==0) {
   1639                    next2=s2;
   1640                } else if(s2==limit2) {
   1641                    U_ASSERT(level2==1);
   1642 
   1643                    /* is s2 at the end of the current stack? */
   1644                    next2=stack2[0].s;
   1645                }
   1646                if(next2!=nullptr) {
   1647                    m1=next1;
   1648                    m2=next2;
   1649                }
   1650            }
   1651            c1=c2=-1;       /* make us fetch new code units */
   1652            continue;
   1653        } else if(c1<0) {
   1654            cmpRes=-1;      /* string 1 ends before string 2 */
   1655            break;
   1656        } else if(c2<0) {
   1657            cmpRes=1;       /* string 2 ends before string 1 */
   1658            break;
   1659        }
   1660        /* c1!=c2 && c1>=0 && c2>=0 */
   1661 
   1662        /* get complete code points for c1, c2 for lookups if either is a surrogate */
   1663        cp1=c1;
   1664        if(U_IS_SURROGATE(c1)) {
   1665            char16_t c;
   1666 
   1667            if(U_IS_SURROGATE_LEAD(c1)) {
   1668                if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
   1669                    /* advance ++s1; only below if cp1 decomposes/case-folds */
   1670                    cp1=U16_GET_SUPPLEMENTARY(c1, c);
   1671                }
   1672            } else /* isTrail(c1) */ {
   1673                if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
   1674                    cp1=U16_GET_SUPPLEMENTARY(c, c1);
   1675                }
   1676            }
   1677        }
   1678 
   1679        cp2=c2;
   1680        if(U_IS_SURROGATE(c2)) {
   1681            char16_t c;
   1682 
   1683            if(U_IS_SURROGATE_LEAD(c2)) {
   1684                if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
   1685                    /* advance ++s2; only below if cp2 decomposes/case-folds */
   1686                    cp2=U16_GET_SUPPLEMENTARY(c2, c);
   1687                }
   1688            } else /* isTrail(c2) */ {
   1689                if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
   1690                    cp2=U16_GET_SUPPLEMENTARY(c, c2);
   1691                }
   1692            }
   1693        }
   1694 
   1695        /*
   1696         * go down one level for each string
   1697         * continue with the main loop as soon as there is a real change
   1698         */
   1699 
   1700        if( level1==0 &&
   1701            (length = ucase_toFullFolding(cp1, &p, options)) >= 0
   1702        ) {
   1703            /* cp1 case-folds to the code point "length" or to p[length] */
   1704            if(U_IS_SURROGATE(c1)) {
   1705                if(U_IS_SURROGATE_LEAD(c1)) {
   1706                    /* advance beyond source surrogate pair if it case-folds */
   1707                    ++s1;
   1708                } else /* isTrail(c1) */ {
   1709                    /*
   1710                     * we got a supplementary code point when hitting its trail surrogate,
   1711                     * therefore the lead surrogate must have been the same as in the other string;
   1712                     * compare this decomposition with the lead surrogate in the other string
   1713                     * remember that this simulates bulk text replacement:
   1714                     * the decomposition would replace the entire code point
   1715                     */
   1716                    --s2;
   1717                    --m2;
   1718                    c2=*(s2-1);
   1719                }
   1720            }
   1721 
   1722            /* push current level pointers */
   1723            stack1[0].start=start1;
   1724            stack1[0].s=s1;
   1725            stack1[0].limit=limit1;
   1726            ++level1;
   1727 
   1728            /* copy the folding result to fold1[] */
   1729            if(length<=UCASE_MAX_STRING_LENGTH) {
   1730                u_memcpy(fold1, p, length);
   1731            } else {
   1732                int32_t i=0;
   1733                U16_APPEND_UNSAFE(fold1, i, length);
   1734                length=i;
   1735            }
   1736 
   1737            /* set next level pointers to case folding */
   1738            start1=s1=fold1;
   1739            limit1=fold1+length;
   1740 
   1741            /* get ready to read from decomposition, continue with loop */
   1742            c1=-1;
   1743            continue;
   1744        }
   1745 
   1746        if( level2==0 &&
   1747            (length = ucase_toFullFolding(cp2, &p, options)) >= 0
   1748        ) {
   1749            /* cp2 case-folds to the code point "length" or to p[length] */
   1750            if(U_IS_SURROGATE(c2)) {
   1751                if(U_IS_SURROGATE_LEAD(c2)) {
   1752                    /* advance beyond source surrogate pair if it case-folds */
   1753                    ++s2;
   1754                } else /* isTrail(c2) */ {
   1755                    /*
   1756                     * we got a supplementary code point when hitting its trail surrogate,
   1757                     * therefore the lead surrogate must have been the same as in the other string;
   1758                     * compare this decomposition with the lead surrogate in the other string
   1759                     * remember that this simulates bulk text replacement:
   1760                     * the decomposition would replace the entire code point
   1761                     */
   1762                    --s1;
   1763                    --m2;
   1764                    c1=*(s1-1);
   1765                }
   1766            }
   1767 
   1768            /* push current level pointers */
   1769            stack2[0].start=start2;
   1770            stack2[0].s=s2;
   1771            stack2[0].limit=limit2;
   1772            ++level2;
   1773 
   1774            /* copy the folding result to fold2[] */
   1775            if(length<=UCASE_MAX_STRING_LENGTH) {
   1776                u_memcpy(fold2, p, length);
   1777            } else {
   1778                int32_t i=0;
   1779                U16_APPEND_UNSAFE(fold2, i, length);
   1780                length=i;
   1781            }
   1782 
   1783            /* set next level pointers to case folding */
   1784            start2=s2=fold2;
   1785            limit2=fold2+length;
   1786 
   1787            /* get ready to read from decomposition, continue with loop */
   1788            c2=-1;
   1789            continue;
   1790        }
   1791 
   1792        /*
   1793         * no decomposition/case folding, max level for both sides:
   1794         * return difference result
   1795         *
   1796         * code point order comparison must not just return cp1-cp2
   1797         * because when single surrogates are present then the surrogate pairs
   1798         * that formed cp1 and cp2 may be from different string indexes
   1799         *
   1800         * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
   1801         * c1=d800 cp1=10001 c2=dc00 cp2=10000
   1802         * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
   1803         *
   1804         * therefore, use same fix-up as in ustring.c/uprv_strCompare()
   1805         * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
   1806         * so we have slightly different pointer/start/limit comparisons here
   1807         */
   1808 
   1809        if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
   1810            /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
   1811            if(
   1812                (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
   1813                (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
   1814            ) {
   1815                /* part of a surrogate pair, leave >=d800 */
   1816            } else {
   1817                /* BMP code point - may be surrogate code point - make <d800 */
   1818                c1-=0x2800;
   1819            }
   1820 
   1821            if(
   1822                (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
   1823                (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
   1824            ) {
   1825                /* part of a surrogate pair, leave >=d800 */
   1826            } else {
   1827                /* BMP code point - may be surrogate code point - make <d800 */
   1828                c2-=0x2800;
   1829            }
   1830        }
   1831 
   1832        cmpRes=c1-c2;
   1833        break;
   1834    }
   1835 
   1836    if(matchLen1) {
   1837        *matchLen1=static_cast<int32_t>(m1-org1);
   1838        *matchLen2=static_cast<int32_t>(m2-org2);
   1839    }
   1840    return cmpRes;
   1841 }
   1842 
   1843 /* internal function */
   1844 U_CFUNC int32_t
   1845 u_strcmpFold(const char16_t *s1, int32_t length1,
   1846             const char16_t *s2, int32_t length2,
   1847             uint32_t options,
   1848             UErrorCode *pErrorCode) {
   1849    return _cmpFold(s1, length1, s2, length2, options, nullptr, nullptr, pErrorCode);
   1850 }
   1851 
   1852 /* public API functions */
   1853 
   1854 U_CAPI int32_t U_EXPORT2
   1855 u_strCaseCompare(const char16_t *s1, int32_t length1,
   1856                 const char16_t *s2, int32_t length2,
   1857                 uint32_t options,
   1858                 UErrorCode *pErrorCode) {
   1859    /* argument checking */
   1860    if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) {
   1861        return 0;
   1862    }
   1863    if(s1==nullptr || length1<-1 || s2==nullptr || length2<-1) {
   1864        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1865        return 0;
   1866    }
   1867    return u_strcmpFold(s1, length1, s2, length2,
   1868                        options|U_COMPARE_IGNORE_CASE,
   1869                        pErrorCode);
   1870 }
   1871 
   1872 U_CAPI int32_t U_EXPORT2
   1873 u_strcasecmp(const char16_t *s1, const char16_t *s2, uint32_t options) {
   1874    UErrorCode errorCode=U_ZERO_ERROR;
   1875    return u_strcmpFold(s1, -1, s2, -1,
   1876                        options|U_COMPARE_IGNORE_CASE,
   1877                        &errorCode);
   1878 }
   1879 
   1880 U_CAPI int32_t U_EXPORT2
   1881 u_memcasecmp(const char16_t *s1, const char16_t *s2, int32_t length, uint32_t options) {
   1882    UErrorCode errorCode=U_ZERO_ERROR;
   1883    return u_strcmpFold(s1, length, s2, length,
   1884                        options|U_COMPARE_IGNORE_CASE,
   1885                        &errorCode);
   1886 }
   1887 
   1888 U_CAPI int32_t U_EXPORT2
   1889 u_strncasecmp(const char16_t *s1, const char16_t *s2, int32_t n, uint32_t options) {
   1890    UErrorCode errorCode=U_ZERO_ERROR;
   1891    return u_strcmpFold(s1, n, s2, n,
   1892                        options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
   1893                        &errorCode);
   1894 }
   1895 
   1896 /* internal API - detect length of shared prefix */
   1897 U_CAPI void
   1898 u_caseInsensitivePrefixMatch(const char16_t *s1, int32_t length1,
   1899                             const char16_t *s2, int32_t length2,
   1900                             uint32_t options,
   1901                             int32_t *matchLen1, int32_t *matchLen2,
   1902                             UErrorCode *pErrorCode) {
   1903    _cmpFold(s1, length1, s2, length2, options,
   1904        matchLen1, matchLen2, pErrorCode);
   1905 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE