[ tor-browser ].git.dasho

ucase.cpp (61666B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2004-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  ucase.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2004aug30
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Low-level Unicode character/string case mapping code.
     19 *   Much code moved here (and modified) from uchar.c.
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 #include "unicode/unistr.h"
     24 #include "unicode/uset.h"
     25 #include "unicode/utf16.h"
     26 #include "cmemory.h"
     27 #include "uassert.h"
     28 #include "ucase.h"
     29 #include "umutex.h"
     30 #include "utrie2.h"
     31 
     32 /* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
     33 #define INCLUDED_FROM_UCASE_CPP
     34 #include "ucase_props_data.h"
     35 
     36 /* set of property starts for UnicodeSet ------------------------------------ */
     37 
     38 static UBool U_CALLCONV
     39 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
     40    /* add the start code point to the USet */
     41    const USetAdder* sa = static_cast<const USetAdder*>(context);
     42    sa->add(sa->set, start);
     43    return true;
     44 }
     45 
     46 U_CFUNC void U_EXPORT2
     47 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
     48    if(U_FAILURE(*pErrorCode)) {
     49        return;
     50    }
     51 
     52    /* add the start code point of each same-value range of the trie */
     53    utrie2_enum(&ucase_props_singleton.trie, nullptr, _enumPropertyStartsRange, sa);
     54 
     55    /* add code points with hardcoded properties, plus the ones following them */
     56 
     57    /* (none right now, see comment below) */
     58 
     59    /*
     60     * Omit code points with hardcoded specialcasing properties
     61     * because we do not build property UnicodeSets for them right now.
     62     */
     63 }
     64 
     65 /* data access primitives --------------------------------------------------- */
     66 
     67 U_CAPI const struct UCaseProps * U_EXPORT2
     68 ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
     69    *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
     70    *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
     71    return &ucase_props_singleton;
     72 }
     73 
     74 U_CFUNC const UTrie2 * U_EXPORT2
     75 ucase_getTrie() {
     76    return &ucase_props_singleton.trie;
     77 }
     78 
     79 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
     80 
     81 /* number of bits in an 8-bit integer value */
     82 static const uint8_t flagsOffset[256]={
     83    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
     84    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
     85    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
     86    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     87    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
     88    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     89    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     90    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     91    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
     92    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     93    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     94    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     95    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     96    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     97    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     98    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
     99 };
    100 
    101 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
    102 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
    103 
    104 /*
    105 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
    106 *
    107 * @param excWord (in) initial exceptions word
    108 * @param idx (in) desired slot index
    109 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
    110 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
    111 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
    112 */
    113 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    114    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
    115        (pExc16)+=SLOT_OFFSET(excWord, idx); \
    116        (value)=*pExc16; \
    117    } else { \
    118        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
    119        (value)=*pExc16++; \
    120        (value)=((value)<<16)|*pExc16; \
    121    } \
    122 } UPRV_BLOCK_MACRO_END
    123 
    124 /* simple case mappings ----------------------------------------------------- */
    125 
    126 U_CAPI UChar32 U_EXPORT2
    127 ucase_tolower(UChar32 c) {
    128    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    129    if(!UCASE_HAS_EXCEPTION(props)) {
    130        if(UCASE_IS_UPPER_OR_TITLE(props)) {
    131            c+=UCASE_GET_DELTA(props);
    132        }
    133    } else {
    134        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    135        uint16_t excWord=*pe++;
    136        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
    137            int32_t delta;
    138            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
    139            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
    140        }
    141        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
    142            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
    143        }
    144    }
    145    return c;
    146 }
    147 
    148 U_CAPI UChar32 U_EXPORT2
    149 ucase_toupper(UChar32 c) {
    150    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    151    if(!UCASE_HAS_EXCEPTION(props)) {
    152        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    153            c+=UCASE_GET_DELTA(props);
    154        }
    155    } else {
    156        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    157        uint16_t excWord=*pe++;
    158        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
    159            int32_t delta;
    160            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
    161            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
    162        }
    163        if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    164            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
    165        }
    166    }
    167    return c;
    168 }
    169 
    170 U_CAPI UChar32 U_EXPORT2
    171 ucase_totitle(UChar32 c) {
    172    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    173    if(!UCASE_HAS_EXCEPTION(props)) {
    174        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    175            c+=UCASE_GET_DELTA(props);
    176        }
    177    } else {
    178        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    179        uint16_t excWord=*pe++;
    180        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
    181            int32_t delta;
    182            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
    183            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
    184        }
    185        int32_t idx;
    186        if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
    187            idx=UCASE_EXC_TITLE;
    188        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    189            idx=UCASE_EXC_UPPER;
    190        } else {
    191            return c;
    192        }
    193        GET_SLOT_VALUE(excWord, idx, pe, c);
    194    }
    195    return c;
    196 }
    197 
    198 static const char16_t iDot[2] = { 0x69, 0x307 };
    199 static const char16_t jDot[2] = { 0x6a, 0x307 };
    200 static const char16_t iOgonekDot[3] = { 0x12f, 0x307 };
    201 static const char16_t iDotGrave[3] = { 0x69, 0x307, 0x300 };
    202 static const char16_t iDotAcute[3] = { 0x69, 0x307, 0x301 };
    203 static const char16_t iDotTilde[3] = { 0x69, 0x307, 0x303 };
    204 
    205 
    206 U_CFUNC void U_EXPORT2
    207 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
    208    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    209    if(!UCASE_HAS_EXCEPTION(props)) {
    210        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
    211            /* add the one simple case mapping, no matter what type it is */
    212            int32_t delta=UCASE_GET_DELTA(props);
    213            if(delta!=0) {
    214                sa->add(sa->set, c+delta);
    215            }
    216        }
    217    } else {
    218        /*
    219         * c has exceptions, so there may be multiple simple and/or
    220         * full case mappings. Add them all.
    221         */
    222        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    223        uint16_t excWord=*pe++;
    224        const uint16_t *pe0=pe;
    225 
    226        // Hardcode the case closure of i and its relatives and ignore the
    227        // data file data for these characters.
    228        // The Turkic dotless i and dotted I with their case mapping conditions
    229        // and case folding option make the related characters behave specially.
    230        // This code matches their closure behavior to their case folding behavior.
    231        if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
    232            // These characters have Turkic case foldings. Hardcode their closure.
    233            if (c == 0x49) {
    234                // Regular i and I are in one equivalence class.
    235                sa->add(sa->set, 0x69);
    236                return;
    237            } else if (c == 0x130) {
    238                // Dotted I is in a class with <0069 0307>
    239                // (for canonical equivalence with <0049 0307>).
    240                sa->addString(sa->set, iDot, 2);
    241                return;
    242            }
    243        } else if (c == 0x69) {
    244            sa->add(sa->set, 0x49);
    245            return;
    246        } else if (c == 0x131) {
    247            // Dotless i is in a class by itself.
    248            return;
    249        }
    250 
    251        /* add all simple case mappings */
    252        for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
    253            if(HAS_SLOT(excWord, idx)) {
    254                pe=pe0;
    255                UChar32 mapping;
    256                GET_SLOT_VALUE(excWord, idx, pe, mapping);
    257                sa->add(sa->set, mapping);
    258            }
    259        }
    260        if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
    261            pe=pe0;
    262            int32_t delta;
    263            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
    264            sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
    265        }
    266 
    267        /* get the closure string pointer & length */
    268        const char16_t *closure;
    269        int32_t closureLength;
    270        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
    271            pe=pe0;
    272            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
    273            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
    274            closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
    275        } else {
    276            closureLength=0;
    277            closure=nullptr;
    278        }
    279 
    280        /* add the full case folding */
    281        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    282            pe=pe0;
    283            int32_t fullLength;
    284            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
    285 
    286            /* start of full case mapping strings */
    287            ++pe;
    288 
    289            fullLength&=0xffff; /* bits 16 and higher are reserved */
    290 
    291            /* skip the lowercase result string */
    292            pe+=fullLength&UCASE_FULL_LOWER;
    293            fullLength>>=4;
    294 
    295            /* add the full case folding string */
    296            int32_t length=fullLength&0xf;
    297            if(length!=0) {
    298                sa->addString(sa->set, (const char16_t *)pe, length);
    299                pe+=length;
    300            }
    301 
    302            /* skip the uppercase and titlecase strings */
    303            fullLength>>=4;
    304            pe+=fullLength&0xf;
    305            fullLength>>=4;
    306            pe+=fullLength;
    307 
    308            closure=(const char16_t *)pe; /* behind full case mappings */
    309        }
    310 
    311        /* add each code point in the closure string */
    312        for(int32_t idx=0; idx<closureLength;) {
    313            UChar32 mapping;
    314            U16_NEXT_UNSAFE(closure, idx, mapping);
    315            sa->add(sa->set, mapping);
    316        }
    317    }
    318 }
    319 
    320 U_CFUNC void U_EXPORT2
    321 ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
    322    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    323    if(!UCASE_HAS_EXCEPTION(props)) {
    324        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
    325            /* add the one simple case mapping, no matter what type it is */
    326            int32_t delta=UCASE_GET_DELTA(props);
    327            if(delta!=0) {
    328                sa->add(sa->set, c+delta);
    329            }
    330        }
    331    } else {
    332        // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
    333        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    334        uint16_t excWord=*pe++;
    335        const uint16_t *pe0=pe;
    336 
    337        // Hardcode the case closure of i and its relatives and ignore the
    338        // data file data for these characters, like in ucase_addCaseClosure().
    339        if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
    340            // These characters have Turkic case foldings. Hardcode their closure.
    341            if (c == 0x49) {
    342                // Regular i and I are in one equivalence class.
    343                sa->add(sa->set, 0x69);
    344                return;
    345            } else if (c == 0x130) {
    346                // For scf=Simple_Case_Folding, dotted I is in a class by itself.
    347                return;
    348            }
    349        } else if (c == 0x69) {
    350            sa->add(sa->set, 0x49);
    351            return;
    352        } else if (c == 0x131) {
    353            // Dotless i is in a class by itself.
    354            return;
    355        }
    356 
    357        // Add all simple case mappings.
    358        for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
    359            if(HAS_SLOT(excWord, idx)) {
    360                pe=pe0;
    361                UChar32 mapping;
    362                GET_SLOT_VALUE(excWord, idx, pe, mapping);
    363                sa->add(sa->set, mapping);
    364            }
    365        }
    366        if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
    367            pe=pe0;
    368            int32_t delta;
    369            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
    370            UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
    371            sa->add(sa->set, mapping);
    372        }
    373 
    374        /* get the closure string pointer & length */
    375        const char16_t *closure;
    376        int32_t closureLength;
    377        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
    378            pe=pe0;
    379            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
    380            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
    381            closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
    382        } else {
    383            closureLength=0;
    384            closure=nullptr;
    385        }
    386 
    387        // Skip the full case mappings.
    388        if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    389            pe=pe0;
    390            int32_t fullLength;
    391            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
    392 
    393            /* start of full case mapping strings */
    394            ++pe;
    395 
    396            fullLength&=0xffff; /* bits 16 and higher are reserved */
    397 
    398            // Skip all 4 full case mappings.
    399            pe+=fullLength&UCASE_FULL_LOWER;
    400            fullLength>>=4;
    401            pe+=fullLength&0xf;
    402            fullLength>>=4;
    403            pe+=fullLength&0xf;
    404            fullLength>>=4;
    405            pe+=fullLength;
    406 
    407            closure=(const char16_t *)pe; /* behind full case mappings */
    408        }
    409 
    410        // Add each code point in the closure string whose scf maps back to c.
    411        for(int32_t idx=0; idx<closureLength;) {
    412            UChar32 mapping;
    413            U16_NEXT_UNSAFE(closure, idx, mapping);
    414            sa->add(sa->set, mapping);
    415        }
    416    }
    417 }
    418 
    419 /*
    420 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
    421 * must be length>0 and max>0 and length<=max
    422 */
    423 static inline int32_t
    424 strcmpMax(const char16_t *s, int32_t length, const char16_t *t, int32_t max) {
    425    int32_t c1, c2;
    426 
    427    max-=length; /* we require length<=max, so no need to decrement max in the loop */
    428    do {
    429        c1=*s++;
    430        c2=*t++;
    431        if(c2==0) {
    432            return 1; /* reached the end of t but not of s */
    433        }
    434        c1-=c2;
    435        if(c1!=0) {
    436            return c1; /* return difference result */
    437        }
    438    } while(--length>0);
    439    /* ends with length==0 */
    440 
    441    if(max==0 || *t==0) {
    442        return 0; /* equal to length of both strings */
    443    } else {
    444        return -max; /* return length difference */
    445    }
    446 }
    447 
    448 U_CFUNC UBool U_EXPORT2
    449 ucase_addStringCaseClosure(const char16_t *s, int32_t length, const USetAdder *sa) {
    450    int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
    451 
    452    if(ucase_props_singleton.unfold==nullptr || s==nullptr) {
    453        return false; /* no reverse case folding data, or no string */
    454    }
    455    if(length<=1) {
    456        /* the string is too short to find any match */
    457        /*
    458         * more precise would be:
    459         * if(!u_strHasMoreChar32Than(s, length, 1))
    460         * but this does not make much practical difference because
    461         * a single supplementary code point would just not be found
    462         */
    463        return false;
    464    }
    465 
    466    const uint16_t *unfold=ucase_props_singleton.unfold;
    467    unfoldRows=unfold[UCASE_UNFOLD_ROWS];
    468    unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
    469    unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
    470    unfold+=unfoldRowWidth;
    471 
    472    if(length>unfoldStringWidth) {
    473        /* the string is too long to find any match */
    474        return false;
    475    }
    476 
    477    /* do a binary search for the string */
    478    start=0;
    479    limit=unfoldRows;
    480    while(start<limit) {
    481        i=(start+limit)/2;
    482        const char16_t *p=reinterpret_cast<const char16_t *>(unfold+(i*unfoldRowWidth));
    483        result=strcmpMax(s, length, p, unfoldStringWidth);
    484 
    485        if(result==0) {
    486            /* found the string: add each code point, and its case closure */
    487            UChar32 c;
    488 
    489            for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
    490                U16_NEXT_UNSAFE(p, i, c);
    491                sa->add(sa->set, c);
    492                ucase_addCaseClosure(c, sa);
    493            }
    494            return true;
    495        } else if(result<0) {
    496            limit=i;
    497        } else /* result>0 */ {
    498            start=i+1;
    499        }
    500    }
    501 
    502    return false; /* string not found */
    503 }
    504 
    505 U_NAMESPACE_BEGIN
    506 
    507 FullCaseFoldingIterator::FullCaseFoldingIterator()
    508        : unfold(reinterpret_cast<const char16_t *>(ucase_props_singleton.unfold)),
    509          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
    510          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
    511          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
    512          currentRow(0),
    513          rowCpIndex(unfoldStringWidth) {
    514    unfold+=unfoldRowWidth;
    515 }
    516 
    517 UChar32
    518 FullCaseFoldingIterator::next(UnicodeString &full) {
    519    // Advance past the last-delivered code point.
    520    const char16_t *p=unfold+(currentRow*unfoldRowWidth);
    521    if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
    522        ++currentRow;
    523        p+=unfoldRowWidth;
    524        rowCpIndex=unfoldStringWidth;
    525    }
    526    if(currentRow>=unfoldRows) { return U_SENTINEL; }
    527    // Set "full" to the NUL-terminated string in the first unfold column.
    528    int32_t length=unfoldStringWidth;
    529    while(length>0 && p[length-1]==0) { --length; }
    530    full.setTo(false, p, length);
    531    // Return the code point.
    532    UChar32 c;
    533    U16_NEXT_UNSAFE(p, rowCpIndex, c);
    534    return c;
    535 }
    536 
    537 namespace LatinCase {
    538 
    539 const int8_t TO_LOWER_NORMAL[LIMIT] = {
    540    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    541    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    542    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    543    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    544 
    545    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    546    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    547    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    548    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    549 
    550    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    551    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    552    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    553    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    554 
    555    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    556    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    557    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    558    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    559 
    560    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    561    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    562    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    563    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
    564 
    565    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    566    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    567    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    568    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
    569 };
    570 
    571 const int8_t TO_LOWER_TR_LT[LIMIT] = {
    572    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    573    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    574    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    575    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    576 
    577    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
    578    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    579    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    580    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    581 
    582    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    583    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    584    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    585    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    586 
    587    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
    588    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    589    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    590    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    591 
    592    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    593    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    594    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
    595    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
    596 
    597    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    598    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    599    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    600    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
    601 };
    602 
    603 const int8_t TO_UPPER_NORMAL[LIMIT] = {
    604    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    605    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    606    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    607    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    608 
    609    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    610    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    611    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    612    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
    613 
    614    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    615    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    616    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    617    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    618 
    619    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    620    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    621    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    622    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
    623 
    624    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    625    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    626    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    627    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
    628 
    629    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    630    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    631    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    632    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
    633 };
    634 
    635 const int8_t TO_UPPER_TR[LIMIT] = {
    636    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    637    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    638    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    639    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    640 
    641    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    642    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    643    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
    644    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
    645 
    646    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    647    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    648    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    649    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    650 
    651    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    652    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    653    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    654    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
    655 
    656    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    657    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    658    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    659    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
    660 
    661    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    662    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    663    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    664    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
    665 };
    666 
    667 }  // namespace LatinCase
    668 
    669 U_NAMESPACE_END
    670 
    671 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
    672 U_CAPI int32_t U_EXPORT2
    673 ucase_getType(UChar32 c) {
    674    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    675    return UCASE_GET_TYPE(props);
    676 }
    677 
    678 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
    679 U_CAPI int32_t U_EXPORT2
    680 ucase_getTypeOrIgnorable(UChar32 c) {
    681    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    682    return UCASE_GET_TYPE_AND_IGNORABLE(props);
    683 }
    684 
    685 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
    686 static inline int32_t
    687 getDotType(UChar32 c) {
    688    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    689    if(!UCASE_HAS_EXCEPTION(props)) {
    690        return props&UCASE_DOT_MASK;
    691    } else {
    692        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    693        return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
    694    }
    695 }
    696 
    697 U_CAPI UBool U_EXPORT2
    698 ucase_isSoftDotted(UChar32 c) {
    699    return getDotType(c)==UCASE_SOFT_DOTTED;
    700 }
    701 
    702 U_CAPI UBool U_EXPORT2
    703 ucase_isCaseSensitive(UChar32 c) {
    704    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    705    if(!UCASE_HAS_EXCEPTION(props)) {
    706        return (props&UCASE_SENSITIVE)!=0;
    707    } else {
    708        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    709        return (*pe&UCASE_EXC_SENSITIVE)!=0;
    710    }
    711 }
    712 
    713 /* string casing ------------------------------------------------------------ */
    714 
    715 /*
    716 * These internal functions form the core of string case mappings.
    717 * They map single code points to result code points or strings and take
    718 * all necessary conditions (context, locale ID, options) into account.
    719 *
    720 * They do not iterate over the source or write to the destination
    721 * so that the same functions are useful for non-standard string storage,
    722 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
    723 * For the same reason, the "surrounding text" context is passed in as a
    724 * UCaseContextIterator which does not make any assumptions about
    725 * the underlying storage.
    726 *
    727 * This section contains helper functions that check for conditions
    728 * in the input text surrounding the current code point
    729 * according to SpecialCasing.txt.
    730 *
    731 * Each helper function gets the index
    732 * - after the current code point if it looks at following text
    733 * - before the current code point if it looks at preceding text
    734 *
    735 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
    736 *
    737 * Final_Sigma
    738 *   C is preceded by a sequence consisting of
    739 *     a cased letter and a case-ignorable sequence,
    740 *   and C is not followed by a sequence consisting of
    741 *     an ignorable sequence and then a cased letter.
    742 *
    743 * More_Above
    744 *   C is followed by one or more characters of combining class 230 (ABOVE)
    745 *   in the combining character sequence.
    746 *
    747 * After_Soft_Dotted
    748 *   The last preceding character with combining class of zero before C
    749 *   was Soft_Dotted,
    750 *   and there is no intervening combining character class 230 (ABOVE).
    751 *
    752 * Before_Dot
    753 *   C is followed by combining dot above (U+0307).
    754 *   Any sequence of characters with a combining class that is neither 0 nor 230
    755 *   may intervene between the current character and the combining dot above.
    756 *
    757 * The erratum from 2002-10-31 adds the condition
    758 *
    759 * After_I
    760 *   The last preceding base character was an uppercase I, and there is no
    761 *   intervening combining character class 230 (ABOVE).
    762 *
    763 *   (See Jitterbug 2344 and the comments on After_I below.)
    764 *
    765 * Helper definitions in Unicode 3.2 UAX 21:
    766 *
    767 * D1. A character C is defined to be cased
    768 *     if it meets any of the following criteria:
    769 *
    770 *   - The general category of C is Titlecase Letter (Lt)
    771 *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
    772 *   - Given D = NFD(C), then it is not the case that:
    773 *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
    774 *     (This third criterion does not add any characters to the list
    775 *      for Unicode 3.2. Ignored.)
    776 *
    777 * D2. A character C is defined to be case-ignorable
    778 *     if it meets either of the following criteria:
    779 *
    780 *   - The general category of C is
    781 *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
    782 *     Letter Modifier (Lm), or Symbol Modifier (Sk)
    783 *   - C is one of the following characters
    784 *     U+0027 APOSTROPHE
    785 *     U+00AD SOFT HYPHEN (SHY)
    786 *     U+2019 RIGHT SINGLE QUOTATION MARK
    787 *            (the preferred character for apostrophe)
    788 *
    789 * D3. A case-ignorable sequence is a sequence of
    790 *     zero or more case-ignorable characters.
    791 */
    792 
    793 #define is_d(c) ((c)=='d' || (c)=='D')
    794 #define is_e(c) ((c)=='e' || (c)=='E')
    795 #define is_i(c) ((c)=='i' || (c)=='I')
    796 #define is_l(c) ((c)=='l' || (c)=='L')
    797 #define is_r(c) ((c)=='r' || (c)=='R')
    798 #define is_t(c) ((c)=='t' || (c)=='T')
    799 #define is_u(c) ((c)=='u' || (c)=='U')
    800 #define is_y(c) ((c)=='y' || (c)=='Y')
    801 #define is_z(c) ((c)=='z' || (c)=='Z')
    802 
    803 /* separator? */
    804 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
    805 
    806 /**
    807 * Requires non-nullptr locale ID but otherwise does the equivalent of
    808 * checking for language codes as if uloc_getLanguage() were called:
    809 * Accepts both 2- and 3-letter codes and accepts case variants.
    810 */
    811 U_CFUNC int32_t
    812 ucase_getCaseLocale(const char *locale) {
    813    /*
    814     * This function used to use uloc_getLanguage(), but the current code
    815     * removes the dependency of this low-level code on uloc implementation code
    816     * and is faster because not the whole locale ID has to be
    817     * examined and copied/transformed.
    818     *
    819     * Because this code does not want to depend on uloc, the caller must
    820     * pass in a non-nullptr locale, i.e., may need to call uloc_getDefault().
    821     */
    822    char c=*locale++;
    823    // Fastpath for English "en" which is often used for default (=root locale) case mappings,
    824    // and for Chinese "zh": Very common but no special case mapping behavior.
    825    // Then check lowercase vs. uppercase to reduce the number of comparisons
    826    // for other locales without special behavior.
    827    if(c=='e') {
    828        /* el or ell? */
    829        c=*locale++;
    830        if(is_l(c)) {
    831            c=*locale++;
    832            if(is_l(c)) {
    833                c=*locale;
    834            }
    835            if(is_sep(c)) {
    836                return UCASE_LOC_GREEK;
    837            }
    838        }
    839        // en, es, ... -> root
    840    } else if(c=='z') {
    841        return UCASE_LOC_ROOT;
    842 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    843    } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
    844 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    845    } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
    846 #else
    847 #   error Unknown charset family!
    848 #endif
    849        // lowercase c
    850        if(c=='t') {
    851            /* tr or tur? */
    852            c=*locale++;
    853            if(is_u(c)) {
    854                c=*locale++;
    855            }
    856            if(is_r(c)) {
    857                c=*locale;
    858                if(is_sep(c)) {
    859                    return UCASE_LOC_TURKISH;
    860                }
    861            }
    862        } else if(c=='a') {
    863            /* az or aze? */
    864            c=*locale++;
    865            if(is_z(c)) {
    866                c=*locale++;
    867                if(is_e(c)) {
    868                    c=*locale;
    869                }
    870                if(is_sep(c)) {
    871                    return UCASE_LOC_TURKISH;
    872                }
    873            }
    874        } else if(c=='l') {
    875            /* lt or lit? */
    876            c=*locale++;
    877            if(is_i(c)) {
    878                c=*locale++;
    879            }
    880            if(is_t(c)) {
    881                c=*locale;
    882                if(is_sep(c)) {
    883                    return UCASE_LOC_LITHUANIAN;
    884                }
    885            }
    886        } else if(c=='n') {
    887            /* nl or nld? */
    888            c=*locale++;
    889            if(is_l(c)) {
    890                c=*locale++;
    891                if(is_d(c)) {
    892                    c=*locale;
    893                }
    894                if(is_sep(c)) {
    895                    return UCASE_LOC_DUTCH;
    896                }
    897            }
    898        } else if(c=='h') {
    899            /* hy or hye? *not* hyw */
    900            c=*locale++;
    901            if(is_y(c)) {
    902                c=*locale++;
    903                if(is_e(c)) {
    904                    c=*locale;
    905                }
    906                if(is_sep(c)) {
    907                    return UCASE_LOC_ARMENIAN;
    908                }
    909            }
    910        }
    911    } else {
    912        // uppercase c
    913        // Same code as for lowercase c but also check for 'E'.
    914        if(c=='T') {
    915            /* tr or tur? */
    916            c=*locale++;
    917            if(is_u(c)) {
    918                c=*locale++;
    919            }
    920            if(is_r(c)) {
    921                c=*locale;
    922                if(is_sep(c)) {
    923                    return UCASE_LOC_TURKISH;
    924                }
    925            }
    926        } else if(c=='A') {
    927            /* az or aze? */
    928            c=*locale++;
    929            if(is_z(c)) {
    930                c=*locale++;
    931                if(is_e(c)) {
    932                    c=*locale;
    933                }
    934                if(is_sep(c)) {
    935                    return UCASE_LOC_TURKISH;
    936                }
    937            }
    938        } else if(c=='L') {
    939            /* lt or lit? */
    940            c=*locale++;
    941            if(is_i(c)) {
    942                c=*locale++;
    943            }
    944            if(is_t(c)) {
    945                c=*locale;
    946                if(is_sep(c)) {
    947                    return UCASE_LOC_LITHUANIAN;
    948                }
    949            }
    950        } else if(c=='E') {
    951            /* el or ell? */
    952            c=*locale++;
    953            if(is_l(c)) {
    954                c=*locale++;
    955                if(is_l(c)) {
    956                    c=*locale;
    957                }
    958                if(is_sep(c)) {
    959                    return UCASE_LOC_GREEK;
    960                }
    961            }
    962        } else if(c=='N') {
    963            /* nl or nld? */
    964            c=*locale++;
    965            if(is_l(c)) {
    966                c=*locale++;
    967                if(is_d(c)) {
    968                    c=*locale;
    969                }
    970                if(is_sep(c)) {
    971                    return UCASE_LOC_DUTCH;
    972                }
    973            }
    974        } else if(c=='H') {
    975            /* hy or hye? *not* hyw */
    976            c=*locale++;
    977            if(is_y(c)) {
    978                c=*locale++;
    979                if(is_e(c)) {
    980                    c=*locale;
    981                }
    982                if(is_sep(c)) {
    983                    return UCASE_LOC_ARMENIAN;
    984                }
    985            }
    986        }
    987    }
    988    return UCASE_LOC_ROOT;
    989 }
    990 
    991 /*
    992 * Is followed by
    993 *   {case-ignorable}* cased
    994 * ?
    995 * (dir determines looking forward/backward)
    996 * If a character is case-ignorable, it is skipped regardless of whether
    997 * it is also cased or not.
    998 */
    999 static UBool
   1000 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
   1001    UChar32 c;
   1002 
   1003    if(iter==nullptr) {
   1004        return false;
   1005    }
   1006 
   1007    for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
   1008        int32_t type=ucase_getTypeOrIgnorable(c);
   1009        if(type&4) {
   1010            /* case-ignorable, continue with the loop */
   1011        } else if(type!=UCASE_NONE) {
   1012            return true; /* followed by cased letter */
   1013        } else {
   1014            return false; /* uncased and not case-ignorable */
   1015        }
   1016    }
   1017 
   1018    return false; /* not followed by cased letter */
   1019 }
   1020 
   1021 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
   1022 static UBool
   1023 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
   1024    UChar32 c;
   1025    int32_t dotType;
   1026    int8_t dir;
   1027 
   1028    if(iter==nullptr) {
   1029        return false;
   1030    }
   1031 
   1032    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
   1033        dotType=getDotType(c);
   1034        if(dotType==UCASE_SOFT_DOTTED) {
   1035            return true; /* preceded by TYPE_i */
   1036        } else if(dotType!=UCASE_OTHER_ACCENT) {
   1037            return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
   1038        }
   1039    }
   1040 
   1041    return false; /* not preceded by TYPE_i */
   1042 }
   1043 
   1044 /*
   1045 * See Jitterbug 2344:
   1046 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
   1047 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
   1048 * we made those releases compatible with Unicode 3.2 which had not fixed
   1049 * a related bug in SpecialCasing.txt.
   1050 *
   1051 * From the Jitterbug 2344 text:
   1052 * ... this bug is listed as a Unicode erratum
   1053 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
   1054 * <quote>
   1055 * There are two errors in SpecialCasing.txt.
   1056 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
   1057 * 2. An incorrect context definition. Correct as follows:
   1058 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
   1059 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
   1060 * ---
   1061 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
   1062 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
   1063 * where the context After_I is defined as:
   1064 * The last preceding base character was an uppercase I, and there is no
   1065 * intervening combining character class 230 (ABOVE).
   1066 * </quote>
   1067 *
   1068 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
   1069 *
   1070 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
   1071 * # This matches the behavior of the canonically equivalent I-dot_above
   1072 *
   1073 * See also the description in this place in older versions of uchar.c (revision 1.100).
   1074 *
   1075 * Markus W. Scherer 2003-feb-15
   1076 */
   1077 
   1078 /* Is preceded by base character 'I' with no intervening cc=230 ? */
   1079 static UBool
   1080 isPrecededBy_I(UCaseContextIterator *iter, void *context) {
   1081    UChar32 c;
   1082    int32_t dotType;
   1083    int8_t dir;
   1084 
   1085    if(iter==nullptr) {
   1086        return false;
   1087    }
   1088 
   1089    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
   1090        if(c==0x49) {
   1091            return true; /* preceded by I */
   1092        }
   1093        dotType=getDotType(c);
   1094        if(dotType!=UCASE_OTHER_ACCENT) {
   1095            return false; /* preceded by different base character (not I), or intervening cc==230 */
   1096        }
   1097    }
   1098 
   1099    return false; /* not preceded by I */
   1100 }
   1101 
   1102 /* Is followed by one or more cc==230 ? */
   1103 static UBool
   1104 isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
   1105    UChar32 c;
   1106    int32_t dotType;
   1107    int8_t dir;
   1108 
   1109    if(iter==nullptr) {
   1110        return false;
   1111    }
   1112 
   1113    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
   1114        dotType=getDotType(c);
   1115        if(dotType==UCASE_ABOVE) {
   1116            return true; /* at least one cc==230 following */
   1117        } else if(dotType!=UCASE_OTHER_ACCENT) {
   1118            return false; /* next base character, no more cc==230 following */
   1119        }
   1120    }
   1121 
   1122    return false; /* no more cc==230 following */
   1123 }
   1124 
   1125 /* Is followed by a dot above (without cc==230 in between) ? */
   1126 static UBool
   1127 isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
   1128    UChar32 c;
   1129    int32_t dotType;
   1130    int8_t dir;
   1131 
   1132    if(iter==nullptr) {
   1133        return false;
   1134    }
   1135 
   1136    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
   1137        if(c==0x307) {
   1138            return true;
   1139        }
   1140        dotType=getDotType(c);
   1141        if(dotType!=UCASE_OTHER_ACCENT) {
   1142            return false; /* next base character or cc==230 in between */
   1143        }
   1144    }
   1145 
   1146    return false; /* no dot above following */
   1147 }
   1148 
   1149 U_CAPI int32_t U_EXPORT2
   1150 ucase_toFullLower(UChar32 c,
   1151                  UCaseContextIterator *iter, void *context,
   1152                  const char16_t **pString,
   1153                  int32_t loc) {
   1154    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
   1155    U_ASSERT(c >= 0);
   1156    UChar32 result=c;
   1157    // Reset the output pointer in case it was uninitialized.
   1158    *pString=nullptr;
   1159    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
   1160    if(!UCASE_HAS_EXCEPTION(props)) {
   1161        if(UCASE_IS_UPPER_OR_TITLE(props)) {
   1162            result=c+UCASE_GET_DELTA(props);
   1163        }
   1164    } else {
   1165        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
   1166        uint16_t excWord=*pe++;
   1167        int32_t full;
   1168 
   1169        pe2=pe;
   1170 
   1171        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
   1172            /* use hardcoded conditions and mappings */
   1173 
   1174            /*
   1175             * Test for conditional mappings first
   1176             *   (otherwise the unconditional default mappings are always taken),
   1177             * then test for characters that have unconditional mappings in SpecialCasing.txt,
   1178             * then get the UnicodeData.txt mappings.
   1179             */
   1180            if( loc==UCASE_LOC_LITHUANIAN &&
   1181                    /* base characters, find accents above */
   1182                    (((c==0x49 || c==0x4a || c==0x12e) &&
   1183                        isFollowedByMoreAbove(iter, context)) ||
   1184                    /* precomposed with accent above, no need to find one */
   1185                    (c==0xcc || c==0xcd || c==0x128))
   1186            ) {
   1187                /*
   1188                    # Lithuanian
   1189 
   1190                    # Lithuanian retains the dot in a lowercase i when followed by accents.
   1191 
   1192                    # Introduce an explicit dot above when lowercasing capital I's and J's
   1193                    # whenever there are more accents above.
   1194                    # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
   1195 
   1196                    0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
   1197                    004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
   1198                    012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
   1199                    00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
   1200                    00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
   1201                    0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
   1202                 */
   1203                switch(c) {
   1204                case 0x49:  /* LATIN CAPITAL LETTER I */
   1205                    *pString=iDot;
   1206                    return 2;
   1207                case 0x4a:  /* LATIN CAPITAL LETTER J */
   1208                    *pString=jDot;
   1209                    return 2;
   1210                case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
   1211                    *pString=iOgonekDot;
   1212                    return 2;
   1213                case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
   1214                    *pString=iDotGrave;
   1215                    return 3;
   1216                case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
   1217                    *pString=iDotAcute;
   1218                    return 3;
   1219                case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
   1220                    *pString=iDotTilde;
   1221                    return 3;
   1222                default:
   1223                    return 0; /* will not occur */
   1224                }
   1225            /* # Turkish and Azeri */
   1226            } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
   1227                /*
   1228                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
   1229                    # The following rules handle those cases.
   1230 
   1231                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1232                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1233                 */
   1234                return 0x69;
   1235            } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
   1236                /*
   1237                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
   1238                    # This matches the behavior of the canonically equivalent I-dot_above
   1239 
   1240                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
   1241                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
   1242                 */
   1243                return 0; /* remove the dot (continue without output) */
   1244            } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
   1245                /*
   1246                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
   1247 
   1248                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
   1249                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
   1250                 */
   1251                return 0x131;
   1252            } else if(c==0x130) {
   1253                /*
   1254                    # Preserve canonical equivalence for I with dot. Turkic is handled below.
   1255 
   1256                    0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1257                 */
   1258                *pString=iDot;
   1259                return 2;
   1260            } else if(  c==0x3a3 &&
   1261                        !isFollowedByCasedLetter(iter, context, 1) &&
   1262                        isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
   1263            ) {
   1264                /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
   1265                /*
   1266                    # Special case for final form of sigma
   1267 
   1268                    03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
   1269                 */
   1270                return 0x3c2; /* greek small final sigma */
   1271            } else {
   1272                /* no known conditional special case mapping, use a normal mapping */
   1273            }
   1274        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   1275            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   1276            full&=UCASE_FULL_LOWER;
   1277            if(full!=0) {
   1278                /* set the output pointer to the lowercase mapping */
   1279                *pString=reinterpret_cast<const char16_t *>(pe+1);
   1280 
   1281                /* return the string length */
   1282                return full;
   1283            }
   1284        }
   1285 
   1286        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
   1287            int32_t delta;
   1288            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
   1289            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
   1290        }
   1291        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1292            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
   1293        }
   1294    }
   1295 
   1296    return (result==c) ? ~result : result;
   1297 }
   1298 
   1299 /* internal */
   1300 static int32_t
   1301 toUpperOrTitle(UChar32 c,
   1302               UCaseContextIterator *iter, void *context,
   1303               const char16_t **pString,
   1304               int32_t loc,
   1305               UBool upperNotTitle) {
   1306    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
   1307    U_ASSERT(c >= 0);
   1308    UChar32 result=c;
   1309    // Reset the output pointer in case it was uninitialized.
   1310    *pString=nullptr;
   1311    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
   1312    if(!UCASE_HAS_EXCEPTION(props)) {
   1313        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
   1314            result=c+UCASE_GET_DELTA(props);
   1315        }
   1316    } else {
   1317        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
   1318        uint16_t excWord=*pe++;
   1319        int32_t full, idx;
   1320 
   1321        pe2=pe;
   1322 
   1323        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
   1324            /* use hardcoded conditions and mappings */
   1325            if(loc==UCASE_LOC_TURKISH && c==0x69) {
   1326                /*
   1327                    # Turkish and Azeri
   1328 
   1329                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
   1330                    # The following rules handle those cases.
   1331 
   1332                    # When uppercasing, i turns into a dotted capital I
   1333 
   1334                    0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
   1335                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
   1336                */
   1337                return 0x130;
   1338            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
   1339                /*
   1340                    # Lithuanian
   1341 
   1342                    # Lithuanian retains the dot in a lowercase i when followed by accents.
   1343 
   1344                    # Remove DOT ABOVE after "i" with upper or titlecase
   1345 
   1346                    0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
   1347                 */
   1348                return 0; /* remove the dot (continue without output) */
   1349            } else if(c==0x0587) {
   1350                // See ICU-13416:
   1351                // և ligature ech-yiwn
   1352                // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
   1353                // but to ԵՎ=ech+vew in Eastern Armenian.
   1354                if(loc==UCASE_LOC_ARMENIAN) {
   1355                    *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
   1356                } else {
   1357                    *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
   1358                }
   1359                return 2;
   1360            } else {
   1361                /* no known conditional special case mapping, use a normal mapping */
   1362            }
   1363        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   1364            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   1365 
   1366            /* start of full case mapping strings */
   1367            ++pe;
   1368 
   1369            /* skip the lowercase and case-folding result strings */
   1370            pe+=full&UCASE_FULL_LOWER;
   1371            full>>=4;
   1372            pe+=full&0xf;
   1373            full>>=4;
   1374 
   1375            if(upperNotTitle) {
   1376                full&=0xf;
   1377            } else {
   1378                /* skip the uppercase result string */
   1379                pe+=full&0xf;
   1380                full=(full>>4)&0xf;
   1381            }
   1382 
   1383            if(full!=0) {
   1384                /* set the output pointer to the result string */
   1385                *pString=reinterpret_cast<const char16_t *>(pe);
   1386 
   1387                /* return the string length */
   1388                return full;
   1389            }
   1390        }
   1391 
   1392        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
   1393            int32_t delta;
   1394            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
   1395            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
   1396        }
   1397        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
   1398            idx=UCASE_EXC_TITLE;
   1399        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
   1400            /* here, titlecase is same as uppercase */
   1401            idx=UCASE_EXC_UPPER;
   1402        } else {
   1403            return ~c;
   1404        }
   1405        GET_SLOT_VALUE(excWord, idx, pe2, result);
   1406    }
   1407 
   1408    return (result==c) ? ~result : result;
   1409 }
   1410 
   1411 U_CAPI int32_t U_EXPORT2
   1412 ucase_toFullUpper(UChar32 c,
   1413                  UCaseContextIterator *iter, void *context,
   1414                  const char16_t **pString,
   1415                  int32_t caseLocale) {
   1416    return toUpperOrTitle(c, iter, context, pString, caseLocale, true);
   1417 }
   1418 
   1419 U_CAPI int32_t U_EXPORT2
   1420 ucase_toFullTitle(UChar32 c,
   1421                  UCaseContextIterator *iter, void *context,
   1422                  const char16_t **pString,
   1423                  int32_t caseLocale) {
   1424    return toUpperOrTitle(c, iter, context, pString, caseLocale, false);
   1425 }
   1426 
   1427 /* case folding ------------------------------------------------------------- */
   1428 
   1429 /*
   1430 * Case folding is similar to lowercasing.
   1431 * The result may be a simple mapping, i.e., a single code point, or
   1432 * a full mapping, i.e., a string.
   1433 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
   1434 * then only the lowercase mapping is stored.
   1435 *
   1436 * Some special cases are hardcoded because their conditions cannot be
   1437 * parsed and processed from CaseFolding.txt.
   1438 *
   1439 * Unicode 3.2 CaseFolding.txt specifies for its status field:
   1440 
   1441 # C: common case folding, common mappings shared by both simple and full mappings.
   1442 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
   1443 # S: simple case folding, mappings to single characters where different from F.
   1444 # T: special case for uppercase I and dotted uppercase I
   1445 #    - For non-Turkic languages, this mapping is normally not used.
   1446 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
   1447 #
   1448 # Usage:
   1449 #  A. To do a simple case folding, use the mappings with status C + S.
   1450 #  B. To do a full case folding, use the mappings with status C + F.
   1451 #
   1452 #    The mappings with status T can be used or omitted depending on the desired case-folding
   1453 #    behavior. (The default option is to exclude them.)
   1454 
   1455 * Unicode 3.2 has 'T' mappings as follows:
   1456 
   1457 0049; T; 0131; # LATIN CAPITAL LETTER I
   1458 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1459 
   1460 * while the default mappings for these code points are:
   1461 
   1462 0049; C; 0069; # LATIN CAPITAL LETTER I
   1463 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1464 
   1465 * U+0130 has no simple case folding (simple-case-folds to itself).
   1466 */
   1467 
   1468 /* return the simple case folding mapping for c */
   1469 U_CAPI UChar32 U_EXPORT2
   1470 ucase_fold(UChar32 c, uint32_t options) {
   1471    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
   1472    if(!UCASE_HAS_EXCEPTION(props)) {
   1473        if(UCASE_IS_UPPER_OR_TITLE(props)) {
   1474            c+=UCASE_GET_DELTA(props);
   1475        }
   1476    } else {
   1477        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
   1478        uint16_t excWord=*pe++;
   1479        int32_t idx;
   1480        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
   1481            /* special case folding mappings, hardcoded */
   1482            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
   1483                /* default mappings */
   1484                if(c==0x49) {
   1485                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1486                    return 0x69;
   1487                } else if(c==0x130) {
   1488                    /* no simple case folding for U+0130 */
   1489                    return c;
   1490                }
   1491            } else {
   1492                /* Turkic mappings */
   1493                if(c==0x49) {
   1494                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1495                    return 0x131;
   1496                } else if(c==0x130) {
   1497                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1498                    return 0x69;
   1499                }
   1500            }
   1501        }
   1502        if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
   1503            return c;
   1504        }
   1505        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
   1506            int32_t delta;
   1507            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
   1508            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
   1509        }
   1510        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
   1511            idx=UCASE_EXC_FOLD;
   1512        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1513            idx=UCASE_EXC_LOWER;
   1514        } else {
   1515            return c;
   1516        }
   1517        GET_SLOT_VALUE(excWord, idx, pe, c);
   1518    }
   1519    return c;
   1520 }
   1521 
   1522 /*
   1523 * Issue for canonical caseless match (UAX #21):
   1524 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
   1525 * canonical equivalence, unlike default-option casefolding.
   1526 * For example, I-grave and I + grave fold to strings that are not canonically
   1527 * equivalent.
   1528 * For more details, see the comment in unorm_compare() in unorm.cpp
   1529 * and the intermediate prototype changes for Jitterbug 2021.
   1530 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
   1531 *
   1532 * This did not get fixed because it appears that it is not possible to fix
   1533 * it for uppercase and lowercase characters (I-grave vs. i-grave)
   1534 * together in a way that they still fold to common result strings.
   1535 */
   1536 
   1537 U_CAPI int32_t U_EXPORT2
   1538 ucase_toFullFolding(UChar32 c,
   1539                    const char16_t **pString,
   1540                    uint32_t options) {
   1541    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
   1542    U_ASSERT(c >= 0);
   1543    UChar32 result=c;
   1544    // Reset the output pointer in case it was uninitialized.
   1545    *pString=nullptr;
   1546    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
   1547    if(!UCASE_HAS_EXCEPTION(props)) {
   1548        if(UCASE_IS_UPPER_OR_TITLE(props)) {
   1549            result=c+UCASE_GET_DELTA(props);
   1550        }
   1551    } else {
   1552        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
   1553        uint16_t excWord=*pe++;
   1554        int32_t full, idx;
   1555 
   1556        pe2=pe;
   1557 
   1558        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
   1559            /* use hardcoded conditions and mappings */
   1560            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
   1561                /* default mappings */
   1562                if(c==0x49) {
   1563                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1564                    return 0x69;
   1565                } else if(c==0x130) {
   1566                    /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1567                    *pString=iDot;
   1568                    return 2;
   1569                }
   1570            } else {
   1571                /* Turkic mappings */
   1572                if(c==0x49) {
   1573                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1574                    return 0x131;
   1575                } else if(c==0x130) {
   1576                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1577                    return 0x69;
   1578                }
   1579            }
   1580        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   1581            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   1582 
   1583            /* start of full case mapping strings */
   1584            ++pe;
   1585 
   1586            /* skip the lowercase result string */
   1587            pe+=full&UCASE_FULL_LOWER;
   1588            full=(full>>4)&0xf;
   1589 
   1590            if(full!=0) {
   1591                /* set the output pointer to the result string */
   1592                *pString=reinterpret_cast<const char16_t *>(pe);
   1593 
   1594                /* return the string length */
   1595                return full;
   1596            }
   1597        }
   1598 
   1599        if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
   1600            return ~c;
   1601        }
   1602        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
   1603            int32_t delta;
   1604            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
   1605            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
   1606        }
   1607        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
   1608            idx=UCASE_EXC_FOLD;
   1609        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1610            idx=UCASE_EXC_LOWER;
   1611        } else {
   1612            return ~c;
   1613        }
   1614        GET_SLOT_VALUE(excWord, idx, pe2, result);
   1615    }
   1616 
   1617    return (result==c) ? ~result : result;
   1618 }
   1619 
   1620 /* case mapping properties API ---------------------------------------------- */
   1621 
   1622 /* public API (see uchar.h) */
   1623 
   1624 U_CAPI UBool U_EXPORT2
   1625 u_isULowercase(UChar32 c) {
   1626    return UCASE_LOWER==ucase_getType(c);
   1627 }
   1628 
   1629 U_CAPI UBool U_EXPORT2
   1630 u_isUUppercase(UChar32 c) {
   1631    return UCASE_UPPER==ucase_getType(c);
   1632 }
   1633 
   1634 /* Transforms the Unicode character to its lower case equivalent.*/
   1635 U_CAPI UChar32 U_EXPORT2
   1636 u_tolower(UChar32 c) {
   1637    return ucase_tolower(c);
   1638 }
   1639 
   1640 /* Transforms the Unicode character to its upper case equivalent.*/
   1641 U_CAPI UChar32 U_EXPORT2
   1642 u_toupper(UChar32 c) {
   1643    return ucase_toupper(c);
   1644 }
   1645 
   1646 /* Transforms the Unicode character to its title case equivalent.*/
   1647 U_CAPI UChar32 U_EXPORT2
   1648 u_totitle(UChar32 c) {
   1649    return ucase_totitle(c);
   1650 }
   1651 
   1652 /* return the simple case folding mapping for c */
   1653 U_CAPI UChar32 U_EXPORT2
   1654 u_foldCase(UChar32 c, uint32_t options) {
   1655    return ucase_fold(c, options);
   1656 }
   1657 
   1658 U_CFUNC int32_t U_EXPORT2
   1659 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
   1660    /* case mapping properties */
   1661    const char16_t *resultString;
   1662    switch(which) {
   1663    case UCHAR_LOWERCASE:
   1664        return (UBool)(UCASE_LOWER==ucase_getType(c));
   1665    case UCHAR_UPPERCASE:
   1666        return (UBool)(UCASE_UPPER==ucase_getType(c));
   1667    case UCHAR_SOFT_DOTTED:
   1668        return ucase_isSoftDotted(c);
   1669    case UCHAR_CASE_SENSITIVE:
   1670        return ucase_isCaseSensitive(c);
   1671    case UCHAR_CASED:
   1672        return (UBool)(UCASE_NONE!=ucase_getType(c));
   1673    case UCHAR_CASE_IGNORABLE:
   1674        return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
   1675    /*
   1676     * Note: The following Changes_When_Xyz are defined as testing whether
   1677     * the NFD form of the input changes when Xyz-case-mapped.
   1678     * However, this simpler implementation of these properties,
   1679     * ignoring NFD, passes the tests.
   1680     * The implementation needs to be changed if the tests start failing.
   1681     * When that happens, optimizations should be used to work with the
   1682     * per-single-code point ucase_toFullXyz() functions unless
   1683     * the NFD form has more than one code point,
   1684     * and the property starts set needs to be the union of the
   1685     * start sets for normalization and case mappings.
   1686     */
   1687    case UCHAR_CHANGES_WHEN_LOWERCASED:
   1688        return (UBool)(ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
   1689    case UCHAR_CHANGES_WHEN_UPPERCASED:
   1690        return (UBool)(ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
   1691    case UCHAR_CHANGES_WHEN_TITLECASED:
   1692        return (UBool)(ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
   1693    /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
   1694    case UCHAR_CHANGES_WHEN_CASEMAPPED:
   1695        return (UBool)(
   1696            ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
   1697            ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
   1698            ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
   1699    default:
   1700        return false;
   1701    }
   1702 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE