[ tor-browser ].git.dasho

ucasemap.cpp (37937B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2005-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  ucasemap.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2005may06
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Case mapping service object and functions using it.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/brkiter.h"
     23 #include "unicode/bytestream.h"
     24 #include "unicode/casemap.h"
     25 #include "unicode/edits.h"
     26 #include "unicode/stringoptions.h"
     27 #include "unicode/stringpiece.h"
     28 #include "unicode/ubrk.h"
     29 #include "unicode/uloc.h"
     30 #include "unicode/ustring.h"
     31 #include "unicode/ucasemap.h"
     32 #if !UCONFIG_NO_BREAK_ITERATION
     33 #include "unicode/utext.h"
     34 #endif
     35 #include "unicode/utf.h"
     36 #include "unicode/utf8.h"
     37 #include "unicode/utf16.h"
     38 #include "bytesinkutil.h"
     39 #include "cmemory.h"
     40 #include "cstring.h"
     41 #include "uassert.h"
     42 #include "ucase.h"
     43 #include "ucasemap_imp.h"
     44 
     45 U_NAMESPACE_USE
     46 
     47 /* UCaseMap service object -------------------------------------------------- */
     48 
     49 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
     50 #if !UCONFIG_NO_BREAK_ITERATION
     51        iter(nullptr),  // starts out without a break iterator
     52 #endif
     53        caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
           // Derive locale[] and caseLocale from localeID. ucasemap_setLocale()
           // reports problems through pErrorCode and returns immediately if
           // *pErrorCode already holds a failure.
     54    ucasemap_setLocale(this, localeID, pErrorCode);
     55 }
     56 
     57 UCaseMap::~UCaseMap() {
     58 #if !UCONFIG_NO_BREAK_ITERATION
           // Deletes the iterator held in iter; deleting nullptr has no effect.
     59    delete iter;
     60 #endif
     61 }
     62 
     63 U_CAPI UCaseMap * U_EXPORT2
     64 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
     65    // Creates a case map service object; returns nullptr and leaves a failure
     66    // code in *pErrorCode if that is not possible. Does nothing on prior failure.
     67    if (U_FAILURE(*pErrorCode)) { return nullptr; }
     68    UCaseMap *result = new UCaseMap(locale, options, pErrorCode);
     69    if (result == nullptr) {
     70        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
     71    } else if (U_FAILURE(*pErrorCode)) {
     72        // The constructor could not set the locale: discard the object.
     73        delete result;
     74        result = nullptr;
     75    }
     76    return result;
     77 }
     78 
     79 U_CAPI void U_EXPORT2
     80 ucasemap_close(UCaseMap *csm) {
           // Destroys the service object. Passing nullptr is harmless because
           // deleting a null pointer has no effect.
     81    delete csm;
     82 }
     83 
     84 U_CAPI const char * U_EXPORT2
     85 ucasemap_getLocale(const UCaseMap *csm) {
           // Returns a pointer to the locale ID buffer stored inside csm (not a copy).
           // csm is dereferenced without a nullptr check.
     86    return csm->locale;
     87 }
     88 
     89 U_CAPI uint32_t U_EXPORT2
     90 ucasemap_getOptions(const UCaseMap *csm) {
           // Returns the options bit set currently stored in csm.
           // csm is dereferenced without a nullptr check.
     91    return csm->options;
     92 }
     93 
     94 U_CAPI void U_EXPORT2
     95 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
     96    if (U_FAILURE(*pErrorCode)) { return; }
     97 
     98    // An explicitly empty ID selects the root locale without any lookup.
     99    const bool emptyID = (locale != nullptr) && (*locale == 0);
    100    if (!emptyID) {
    101        const int32_t capacity = (int32_t)sizeof(csm->locale);
    102        UErrorCode status = U_ZERO_ERROR;
    103        int32_t idLength = uloc_getName(locale, csm->locale, capacity, &status);
    104        const bool filledExactly = U_SUCCESS(status) && idLength == capacity;
    105        if (filledExactly || status == U_BUFFER_OVERFLOW_ERROR) {
    106            // No room for the full name; the language code suffices for case mappings.
    107            status = U_ZERO_ERROR;
    108            idLength = uloc_getLanguage(locale, csm->locale, capacity, &status);
    109        }
    110        if (U_FAILURE(status)) {
    111            *pErrorCode = status;
    112        } else if (idLength == capacity) {
    113            *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
    114        }
    115        if (U_SUCCESS(*pErrorCode)) {
    116            csm->caseLocale = ucase_getCaseLocale(csm->locale);
    117            return;
    118        }
    119    }
    120    // Empty ID, or the ID could not be stored: fall back to the root locale.
    121    csm->locale[0] = 0;
    122    csm->caseLocale = UCASE_LOC_ROOT;
    123 }
    124 
    125 U_CAPI void U_EXPORT2
    126 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
           // Leaves csm untouched if *pErrorCode already holds a failure.
    127    if(U_FAILURE(*pErrorCode)) {
    128        return;
    129    }
           // The new bit set replaces the stored one as a whole; it is not validated here.
    130    csm->options=options;
    131 }
    132 
    133 /* UTF-8 string case mappings ----------------------------------------------- */
    134 
    135 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
    136 
    137 namespace {
    138 
    139 /*
    140 * Appends one full case mapping result, see UCASE_MAX_STRING_LENGTH.
    141 * result<0: ~result is the unchanged code point; result<=UCASE_MAX_STRING_LENGTH: UTF-16
    142 * length of the string s; otherwise result is the single replacement code point.
    143 */
    144 inline UBool
    145 appendResult(int32_t cpLength, int32_t result, const char16_t *s,
    146             ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
    147    U_ASSERT(U_SUCCESS(errorCode));
    148    if (result >= 0) {
    149        if (result > UCASE_MAX_STRING_LENGTH) {
    150            ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
    151            return true;
    152        }
    153        return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
    154    }
    155    // Unchanged: record it, and copy it unless unchanged text is omitted.
    156    if (edits != nullptr) {
    157        edits->addUnchanged(cpLength);
    158    }
    159    if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
    160        ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
    161    }
    162    return true;
    163 }
    164 
    165 // See unicode/utf8.h U8_APPEND_UNSAFE().
        // Lead byte of a two-byte UTF-8 sequence: 0xc0 combined with the bits of c above the low six.
        // No range check is made; the result is only a valid lead byte for c in U+0080..U+07FF.
    166 inline uint8_t getTwoByteLead(UChar32 c) { return static_cast<uint8_t>((c >> 6) | 0xc0); }
    167 inline uint8_t getTwoByteTrail(UChar32 c) { return static_cast<uint8_t>((c & 0x3f) | 0x80); }  // trail byte: 0x80 combined with the low six bits of c
    168 
    169 UChar32 U_CALLCONV
    170 utf8_caseContextIterator(void *context, int8_t dir) {
    171    // Context callback for the ucase_toFull...() functions over UTF-8 text.
    172    // dir<0 restarts backward iteration from cpStart, dir>0 restarts forward
    173    // iteration from cpLimit, and dir==0 continues in the stored direction.
    174    // Returns the next code point (negative for ill-formed UTF-8), or
    175    // U_SENTINEL when the start or limit of the text has been reached.
    176    UCaseContext *ctx = static_cast<UCaseContext *>(context);
    177 
    178    // Establish the direction, repositioning on an explicit restart.
    179    if (dir == 0) {
    180        dir = ctx->dir;
    181    } else {
    182        ctx->index = (dir < 0) ? ctx->cpStart : ctx->cpLimit;
    183        ctx->dir = dir;
    184    }
    185 
    186    // Fetch the next code point in that direction, if there is one.
    187    const uint8_t *text = (const uint8_t *)ctx->p;
    188    UChar32 cp = U_SENTINEL;
    189    if (dir < 0) {
    190        if (ctx->start < ctx->index) {
    191            U8_PREV(text, ctx->start, ctx->index, cp);
    192        }
    193    } else {
    194        if (ctx->index < ctx->limit) {
    195            U8_NEXT(text, ctx->index, ctx->limit, cp);
    196        }
    197    }
    198    return cp;
    199 }
    200 
    201 /**
    202 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
    203 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
        *
        * Output goes to sink; if edits is not nullptr, the changes are recorded there too.
        * Text that stays the same is tracked as the run [prev..current[ and flushed with
        * ByteSinkUtil::appendUnchanged() just before each changed code point and once
        * more after the loop. csc is only used for the context-sensitive
        * ucase_toFullLower() call (caseLocale >= 0). Scanning stops as soon as
        * errorCode holds a failure.
    204 */
    205 void toLower(int32_t caseLocale, uint32_t options,
    206             const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
    207             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
           // Choose the Latin fast-path table: the TR/LT variant is used for Turkish and
           // Lithuanian lowercasing and for case folding with non-default fold options.
    208    const int8_t *latinToLower;
    209    if (caseLocale == UCASE_LOC_ROOT ||
    210            (caseLocale >= 0 ?
    211                !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
    212                (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
    213        latinToLower = LatinCase::TO_LOWER_NORMAL;
    214    } else {
    215        latinToLower = LatinCase::TO_LOWER_TR_LT;
    216    }
    217    const UTrie2 *trie = ucase_getTrie();
    218    int32_t prev = srcStart;
    219    int32_t srcIndex = srcStart;
    220    for (;;) {
    221        // fast path for simple cases
    222        int32_t cpStart;
    223        UChar32 c;
    224        for (;;) {
    225            if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
    226                c = U_SENTINEL;
    227                break;
    228            }
    229            uint8_t lead = src[srcIndex++];
                   // ASCII: the table yields a delta, 0 for "unchanged", or EXC for the slow path.
    230            if (lead <= 0x7f) {
    231                int8_t d = latinToLower[lead];
    232                if (d == LatinCase::EXC) {
    233                    cpStart = srcIndex - 1;
    234                    c = lead;
    235                    break;
    236                }
    237                if (d == 0) { continue; }
    238                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
    239                                              sink, options, edits, errorCode);
    240                char ascii = static_cast<char>(lead + d);
    241                sink.Append(&ascii, 1);
    242                if (edits != nullptr) {
    243                    edits->addReplace(1, 1);
    244                }
    245                prev = srcIndex;
    246                continue;
    247            } else if (lead < 0xe3) {
    248                uint8_t t;
    249                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
    250                        (t = src[srcIndex] - 0x80) <= 0x3f) {
    251                    // U+0080..U+017F
    252                    ++srcIndex;
    253                    c = ((lead - 0xc0) << 6) | t;
    254                    int8_t d = latinToLower[c];
    255                    if (d == LatinCase::EXC) {
    256                        cpStart = srcIndex - 2;
    257                        break;
    258                    }
    259                    if (d == 0) { continue; }
    260                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
    261                                                  sink, options, edits, errorCode);
    262                    ByteSinkUtil::appendTwoBytes(c + d, sink);
    263                    if (edits != nullptr) {
    264                        edits->addReplace(2, 2);
    265                    }
    266                    prev = srcIndex;
    267                    continue;
    268                }
    269            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
    270                    (srcIndex + 2) <= srcLimit &&
    271                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
    272                // most of CJK: no case mappings
    273                srcIndex += 2;
    274                continue;
    275            }
                   // No shortcut applied: step back to the lead byte and decode the code point.
    276            cpStart = --srcIndex;
    277            U8_NEXT(src, srcIndex, srcLimit, c);
    278            if (c < 0) {
    279                // ill-formed UTF-8
    280                continue;
    281            }
    282            uint16_t props = UTRIE2_GET16(trie, c);
    283            if (UCASE_HAS_EXCEPTION(props)) { break; }
    284            int32_t delta;
    285            if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
    286                continue;
    287            }
    288            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
    289                                          sink, options, edits, errorCode);
    290            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
    291            prev = srcIndex;
    292        }
               // c is U_SENTINEL here only when the inner loop hit srcLimit or a failure.
    293        if (c < 0) {
    294            break;
    295        }
    296        // slow path
    297        const char16_t *s;
    298        if (caseLocale >= 0) {
    299            csc->cpStart = cpStart;
    300            csc->cpLimit = srcIndex;
    301            c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
    302        } else {
    303            c = ucase_toFullFolding(c, &s, options);
    304        }
               // On a negative result nothing is appended and prev is not advanced,
               // so the code point remains part of the pending unchanged run.
    305        if (c >= 0) {
    306            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
    307                                          sink, options, edits, errorCode);
    308            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
    309            prev = srcIndex;
    310        }
    311    }
           // Flush whatever unchanged text is left after the last change.
    312    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
    313                                  sink, options, edits, errorCode);
    314 }
    315 
    316 void toUpper(int32_t caseLocale, uint32_t options,
    317             const uint8_t *src, UCaseContext *csc, int32_t srcLength,
    318             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
           // Uppercases src[0..srcLength[ into sink, recording changes in edits if it is
           // not nullptr. Structured like toLower(): a table-driven fast path for ASCII
           // and U+0080..U+017F, a trie lookup for code points without exceptions, and
           // ucase_toFullUpper() for everything else. Unchanged text is collected as the
           // run [prev..current[ and flushed before each change and after the loop.
           // Only Turkish selects a different Latin table.
    319    const int8_t *latinToUpper;
    320    if (caseLocale == UCASE_LOC_TURKISH) {
    321        latinToUpper = LatinCase::TO_UPPER_TR;
    322    } else {
    323        latinToUpper = LatinCase::TO_UPPER_NORMAL;
    324    }
    325    const UTrie2 *trie = ucase_getTrie();
    326    int32_t prev = 0;
    327    int32_t srcIndex = 0;
    328    for (;;) {
    329        // fast path for simple cases
    330        int32_t cpStart;
    331        UChar32 c;
    332        for (;;) {
    333            if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
    334                c = U_SENTINEL;
    335                break;
    336            }
    337            uint8_t lead = src[srcIndex++];
                   // ASCII: the table yields a delta, 0 for "unchanged", or EXC for the slow path.
    338            if (lead <= 0x7f) {
    339                int8_t d = latinToUpper[lead];
    340                if (d == LatinCase::EXC) {
    341                    cpStart = srcIndex - 1;
    342                    c = lead;
    343                    break;
    344                }
    345                if (d == 0) { continue; }
    346                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
    347                                              sink, options, edits, errorCode);
    348                char ascii = static_cast<char>(lead + d);
    349                sink.Append(&ascii, 1);
    350                if (edits != nullptr) {
    351                    edits->addReplace(1, 1);
    352                }
    353                prev = srcIndex;
    354                continue;
    355            } else if (lead < 0xe3) {
    356                uint8_t t;
    357                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
    358                        (t = src[srcIndex] - 0x80) <= 0x3f) {
    359                    // U+0080..U+017F
    360                    ++srcIndex;
    361                    c = ((lead - 0xc0) << 6) | t;
    362                    int8_t d = latinToUpper[c];
    363                    if (d == LatinCase::EXC) {
    364                        cpStart = srcIndex - 2;
    365                        break;
    366                    }
    367                    if (d == 0) { continue; }
    368                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
    369                                                  sink, options, edits, errorCode);
    370                    ByteSinkUtil::appendTwoBytes(c + d, sink);
    371                    if (edits != nullptr) {
    372                        edits->addReplace(2, 2);
    373                    }
    374                    prev = srcIndex;
    375                    continue;
    376                }
    377            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
    378                    (srcIndex + 2) <= srcLength &&
    379                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
    380                // most of CJK: no case mappings
    381                srcIndex += 2;
    382                continue;
    383            }
                   // No shortcut applied: step back to the lead byte and decode the code point.
    384            cpStart = --srcIndex;
    385            U8_NEXT(src, srcIndex, srcLength, c);
    386            if (c < 0) {
    387                // ill-formed UTF-8
    388                continue;
    389            }
    390            uint16_t props = UTRIE2_GET16(trie, c);
    391            if (UCASE_HAS_EXCEPTION(props)) { break; }
    392            int32_t delta;
    393            if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
    394                continue;
    395            }
    396            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
    397                                          sink, options, edits, errorCode);
    398            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
    399            prev = srcIndex;
    400        }
               // c is U_SENTINEL here only when the inner loop hit srcLength or a failure.
    401        if (c < 0) {
    402            break;
    403        }
    404        // slow path
    405        csc->cpStart = cpStart;
    406        csc->cpLimit = srcIndex;
    407        const char16_t *s;
    408        c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
               // On a negative result nothing is appended and prev is not advanced,
               // so the code point remains part of the pending unchanged run.
    409        if (c >= 0) {
    410            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
    411                                          sink, options, edits, errorCode);
    412            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
    413            prev = srcIndex;
    414        }
    415    }
           // Flush whatever unchanged text is left after the last change.
    416    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
    417                                  sink, options, edits, errorCode);
    418 }
    419 
    420 }  // namespace
    421 
    422 #if !UCONFIG_NO_BREAK_ITERATION
    423 
    424 namespace {
    425 
    426 constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];  // first byte of the UTF-8 encoding of U+0301
    427
    428 constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];  // second byte of the UTF-8 encoding of U+0301
    429 
    430 /**
    431 * Input: c is a letter I with or without acute accent.
    432 * start is the index in src after c, and is less than segmentLimit.
    433 * If a plain i/I is followed by a plain j/J,
    434 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
    435 * then we output accordingly.
        * Only the text after c is written here (a titlecased J plus unchanged bytes);
        * nothing is written on the paths that return start.
    436 *
    437 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
    438 */
    439 int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
    440                          ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
    441    U_ASSERT(start < segmentLimit);
    442 
    443    int32_t index = start;
    444    bool withAcute = false;
    445 
    446    // If the conditions are met, then the following variables tell us what to output.
    447    int32_t unchanged1 = 0;  // bytes before the j, or the whole sequence (0..5 in UTF-8)
    448    bool doTitleJ = false;  // true if the j needs to be titlecased
    449    int32_t unchanged2 = 0;  // bytes after the j (0 or 2: the acute accent)
    450 
    451    // next character after the first letter
    452    UChar32 c2;
    453    c2 = src[index++];
    454 
    455    // Is the first letter an i/I with accent?
    456    if (c == u'I') {
               // Decomposed form: both bytes of the combining acute must follow the I.
    457        if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
    458            withAcute = true;
    459            unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
    460            if (index == segmentLimit) { return start; }
    461            c2 = src[index++];
    462        }
    463    } else {  // Í
    464        withAcute = true;
    465    }
    466 
    467    // Is the next character a j/J?
    468    if (c2 == u'j') {
    469        doTitleJ = true;
    470    } else if (c2 == u'J') {
    471        ++unchanged1;
    472    } else {
    473        return start;
    474    }
    475 
    476    // A plain i/I must be followed by a plain j/J.
    477    // An i/I with acute must be followed by a j/J with acute.
    478    if (withAcute) {
               // Both bytes of the acute accent must follow the j/J inside the segment.
    479        if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
    480            return start;
    481        }
    482        if (doTitleJ) {
    483            unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
    484        } else {
    485            unchanged1 = unchanged1 + 2;    // ACUTE is 2 code units in UTF-8
    486        }
    487    }
    488 
    489    // There must not be another combining mark.
    490    if (index < segmentLimit) {
    491        int32_t cp;
    492        int32_t i = index;
    493        U8_NEXT(src, i, segmentLimit, cp);
    494        uint32_t typeMask = U_GET_GC_MASK(cp);
    495        if ((typeMask & U_GC_M_MASK) != 0) {
    496            return start;
    497        }
    498    }
    499 
    500    // Output the rest of the Dutch IJ.
    501    ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
    502    start += unchanged1;
    503    if (doTitleJ) {
    504        ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
    505        ++start;
    506    }
    507    ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
    508 
    509    U_ASSERT(start + unchanged2 == index);
    510    return index;
    511 }
    512 
    513 }  // namespace
    514 
    515 U_CFUNC void U_CALLCONV
    516 ucasemap_internalUTF8ToTitle(
    517        int32_t caseLocale, uint32_t options, BreakIterator *iter,
    518        const uint8_t *src, int32_t srcLength,
    519        ByteSink &sink, icu::Edits *edits,
    520        UErrorCode &errorCode) {
           // Titlecases src[0..srcLength[ segment by segment, using iter to find the
           // segment boundaries. In each segment the text before the titlecasing
           // position is copied unchanged, one code point is titlecased, and the rest
           // is lowercased (or copied as-is with U_TITLECASE_NO_LOWERCASE).
           // Returns early as soon as an append step reports failure.
    521    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
    522        return;
    523    }
    524 
    525    /* set up local variables */
    526    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    527    csc.p=(void *)src;
    528    csc.limit=srcLength;
    529    int32_t prev=0;
    530    UBool isFirstIndex=true;
    531 
    532    /* titlecasing loop */
    533    while(prev<srcLength) {
    534        /* find next index where to titlecase */
    535        int32_t index;
    536        if(isFirstIndex) {
    537            isFirstIndex=false;
    538            index=iter->first();
    539        } else {
    540            index=iter->next();
    541        }
               // Treat the end of iteration or an out-of-range boundary as the end of the text.
    542        if(index==UBRK_DONE || index>srcLength) {
    543            index=srcLength;
    544        }
    545 
    546        /*
    547         * Segment [prev..index[ into 3 parts:
    548         * a) skipped characters (copy as-is) [prev..titleStart[
    549         * b) first letter (titlecase)              [titleStart..titleLimit[
    550         * c) subsequent characters (lowercase)                 [titleLimit..index[
    551         */
    552        if(prev<index) {
    553            /* find and copy skipped characters [prev..titleStart[ */
    554            int32_t titleStart=prev;
    555            int32_t titleLimit=prev;
    556            UChar32 c;
    557            U8_NEXT(src, titleLimit, index, c);
    558            if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
    559                // Adjust the titlecasing index to the next cased character,
    560                // or to the next letter/number/symbol/private use.
    561                // Stop with titleStart<titleLimit<=index
    562                // if there is a character to be titlecased,
    563                // or else stop with titleStart==titleLimit==index.
    564                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
    565                while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
    566                    titleStart=titleLimit;
    567                    if(titleLimit==index) {
    568                        break;
    569                    }
    570                    U8_NEXT(src, titleLimit, index, c);
    571                }
    572                if (prev < titleStart) {
    573                    if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
    574                                                       sink, options, edits, errorCode)) {
    575                        return;
    576                    }
    577                }
    578            }
    579 
    580            if(titleStart<titleLimit) {
    581                /* titlecase c which is from [titleStart..titleLimit[ */
    582                if(c>=0) {
    583                    csc.cpStart=titleStart;
    584                    csc.cpLimit=titleLimit;
    585                    const char16_t *s;
    586                    c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
    587                    if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
    588                        return;
    589                    }
    590                } else {
    591                    // Malformed UTF-8.
    592                    if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
    593                                                       sink, options, edits, errorCode)) {
    594                        return;
    595                    }
    596                }
    597 
    598                /* Special case Dutch IJ titlecasing */
    599                if (titleLimit < index &&
    600                    caseLocale == UCASE_LOC_DUTCH) {
                           // A negative c encodes an unmapped code point as ~c
                           // (the convention appendResult() decodes); undo that.
    601                    if (c < 0) {
    602                        c = ~c;
    603                    }
    604 
    605                    if (c == u'I' || c == u'Í') {
    606                        titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
    607                    }
    608                }
    609 
    610                /* lowercase [titleLimit..index[ */
    611                if(titleLimit<index) {
    612                    if((options&U_TITLECASE_NO_LOWERCASE)==0) {
    613                        /* Normal operation: Lowercase the rest of the word. */
    614                        toLower(caseLocale, options,
    615                                src, &csc, titleLimit, index,
    616                                sink, edits, errorCode);
    617                        if(U_FAILURE(errorCode)) {
    618                            return;
    619                        }
    620                    } else {
    621                        /* Optionally just copy the rest of the word unchanged. */
    622                        if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
    623                                                           sink, options, edits, errorCode)) {
    624                            return;
    625                        }
    626                    }
    627                }
    628            }
    629        }
    630 
    631        prev=index;
    632    }
    633 }
    634 
    635 #endif
    636 
    637 U_NAMESPACE_BEGIN
    638 namespace GreekUpper {
    639 
    640 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
    641    // Scans forward from index i, skipping case-ignorable characters.
    642    // The first other character decides: true if cased, false if uncased.
    643    for (int32_t pos = i; pos < length;) {
    644        UChar32 cp;
    645        U8_NEXT(s, pos, length, cp);
    646        const int32_t props = ucase_getTypeOrIgnorable(cp);
    647        if ((props & UCASE_IGNORABLE) == 0) {
    648            return props != UCASE_NONE;
    649        }
    650        // Case-ignorable: keep looking.
    651    }
    652    // Reached the end of the text without finding a cased letter.
    653    return false;
    654 }
    655 
    656 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
    657 void toUpper(uint32_t options,
    658             const uint8_t *src, int32_t srcLength,
    659             ByteSink &sink, Edits *edits,
    660             UErrorCode &errorCode) {
    661    uint32_t state = 0;
    662    for (int32_t i = 0; i < srcLength;) {
    663        int32_t nextIndex = i;
    664        UChar32 c;
    665        U8_NEXT(src, nextIndex, srcLength, c);
    666        uint32_t nextState = 0;
    667        int32_t type = ucase_getTypeOrIgnorable(c);
    668        if ((type & UCASE_IGNORABLE) != 0) {
    669            // c is case-ignorable
    670            nextState |= (state & AFTER_CASED);
    671        } else if (type != UCASE_NONE) {
    672            // c is cased
    673            nextState |= AFTER_CASED;
    674        }
    675        uint32_t data = getLetterData(c);
    676        if (data > 0) {
    677            uint32_t upper = data & UPPER_MASK;
    678            // Add a dialytika to this iota or ypsilon vowel
    679            // if we removed a tonos from the previous vowel,
    680            // and that previous vowel did not also have (or gain) a dialytika.
    681            // Adding one only to the final vowel in a longer sequence
    682            // (which does not occur in normal writing) would require lookahead.
    683            // Set the same flag as for preserving an existing dialytika.
    684            if ((data & HAS_VOWEL) != 0 &&
    685                (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
    686                    0 &&
    687                (upper == 0x399 || upper == 0x3A5)) {
    688                data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
    689                                                                           : HAS_COMBINING_DIALYTIKA;
    690            }
    691            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
    692            if ((data & HAS_YPOGEGRAMMENI) != 0) {
    693                numYpogegrammeni = 1;
    694            }
    695            const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
    696            // Skip combining diacritics after this Greek letter.
    697            int32_t nextNextIndex = nextIndex;
    698            while (nextIndex < srcLength) {
    699                UChar32 c2;
    700                U8_NEXT(src, nextNextIndex, srcLength, c2);
    701                uint32_t diacriticData = getDiacriticData(c2);
    702                if (diacriticData != 0) {
    703                    data |= diacriticData;
    704                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
    705                        ++numYpogegrammeni;
    706                    }
    707                    nextIndex = nextNextIndex;
    708                } else {
    709                    break;  // not a Greek diacritic
    710                }
    711            }
    712            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
    713                nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
    714                                                  : AFTER_VOWEL_WITH_COMBINING_ACCENT;
    715            }
    716            // Map according to Greek rules.
    717            UBool addTonos = false;
    718            if (upper == 0x397 &&
    719                    (data & HAS_ACCENT) != 0 &&
    720                    numYpogegrammeni == 0 &&
    721                    (state & AFTER_CASED) == 0 &&
    722                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
    723                // Keep disjunctive "or" with (only) a tonos.
    724                // We use the same "word boundary" conditions as for the Final_Sigma test.
    725                if (hasPrecomposedAccent) {
    726                    upper = 0x389;  // Preserve the precomposed form.
    727                } else {
    728                    addTonos = true;
    729                }
    730            } else if ((data & HAS_DIALYTIKA) != 0) {
    731                // Preserve a vowel with dialytika in precomposed form if it exists.
    732                if (upper == 0x399) {
    733                    upper = 0x3AA;
    734                    data &= ~HAS_EITHER_DIALYTIKA;
    735                } else if (upper == 0x3A5) {
    736                    upper = 0x3AB;
    737                    data &= ~HAS_EITHER_DIALYTIKA;
    738                }
    739            }
    740 
    741            UBool change;
    742            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
    743                change = true;  // common, simple usage
    744            } else {
    745                // Find out first whether we are changing the text.
    746                U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
    747                change = (i + 2) > nextIndex ||
    748                        src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
    749                        numYpogegrammeni > 0;
    750                int32_t i2 = i + 2;
    751                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
    752                    change |= (i2 + 2) > nextIndex ||
    753                            src[i2] != static_cast<uint8_t>(u8"\u0308"[0]) ||
    754                            src[i2 + 1] != static_cast<uint8_t>(u8"\u0308"[1]);
    755                    i2 += 2;
    756                }
    757                if (addTonos) {
    758                    change |= (i2 + 2) > nextIndex ||
    759                            src[i2] != static_cast<uint8_t>(u8"\u0301"[0]) ||
    760                            src[i2 + 1] != static_cast<uint8_t>(u8"\u0301"[1]);
    761                    i2 += 2;
    762                }
    763                int32_t oldLength = nextIndex - i;
    764                int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
    765                change |= oldLength != newLength;
    766                if (change) {
    767                    if (edits != nullptr) {
    768                        edits->addReplace(oldLength, newLength);
    769                    }
    770                } else {
    771                    if (edits != nullptr) {
    772                        edits->addUnchanged(oldLength);
    773                    }
    774                    // Write unchanged text?
    775                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
    776                }
    777            }
    778 
    779            if (change) {
    780                ByteSinkUtil::appendTwoBytes(upper, sink);
    781                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
    782                    sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
    783                }
    784                if (addTonos) {
    785                    sink.AppendU8(u8"\u0301", 2);
    786                }
    787                while (numYpogegrammeni > 0) {
    788                    sink.AppendU8(u8"\u0399", 2);
    789                    --numYpogegrammeni;
    790                }
    791            }
    792        } else if(c>=0) {
    793            const char16_t *s;
    794            c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
    795            if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
    796                return;
    797            }
    798        } else {
    799            // Malformed UTF-8.
    800            if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
    801                                               sink, options, edits, errorCode)) {
    802                return;
    803            }
    804        }
    805        i = nextIndex;
    806        state = nextState;
    807    }
    808 }
    809 
    810 }  // namespace GreekUpper
    811 U_NAMESPACE_END
    812 
    813 static void U_CALLCONV
    814 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
    815                             const uint8_t *src, int32_t srcLength,
    816                             icu::ByteSink &sink, icu::Edits *edits,
    817                             UErrorCode &errorCode) {
    818    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    819    csc.p=(void *)src;
    820    csc.limit=srcLength;
    821    toLower(
    822        caseLocale, options,
    823        src, &csc, 0, srcLength,
    824        sink, edits, errorCode);
    825 }
    826 
    827 static void U_CALLCONV
    828 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
    829                             const uint8_t *src, int32_t srcLength,
    830                             icu::ByteSink &sink, icu::Edits *edits,
    831                             UErrorCode &errorCode) {
    832    if (caseLocale == UCASE_LOC_GREEK) {
    833        GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
    834    } else {
    835        UCaseContext csc=UCASECONTEXT_INITIALIZER;
    836        csc.p=(void *)src;
    837        csc.limit=srcLength;
    838        toUpper(
    839            caseLocale, options,
    840            src, &csc, srcLength,
    841            sink, edits, errorCode);
    842    }
    843 }
    844 
    845 static void U_CALLCONV
    846 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
    847                          const uint8_t *src, int32_t srcLength,
    848                          icu::ByteSink &sink, icu::Edits *edits,
    849                          UErrorCode &errorCode) {
    850    toLower(
    851        -1, options,
    852        src, nullptr, 0, srcLength,
    853        sink, edits, errorCode);
    854 }
    855 
    856 void
    857 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    858                 const char *src, int32_t srcLength,
    859                 UTF8CaseMapper *stringCaseMapper,
    860                 icu::ByteSink &sink, icu::Edits *edits,
    861                 UErrorCode &errorCode) {
    862    /* check argument values */
    863    if (U_FAILURE(errorCode)) {
    864        return;
    865    }
    866    if ((src == nullptr && srcLength != 0) || srcLength < -1) {
    867        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    868        return;
    869    }
    870 
    871    // Get the string length.
    872    if (srcLength == -1) {
    873        srcLength = static_cast<int32_t>(uprv_strlen(src));
    874    }
    875 
    876    if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
    877        edits->reset();
    878    }
    879    stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
    880                     reinterpret_cast<const uint8_t*>(src), srcLength, sink, edits, errorCode);
    881    sink.Flush();
    882    if (U_SUCCESS(errorCode)) {
    883        if (edits != nullptr) {
    884            edits->copyErrorTo(errorCode);
    885        }
    886    }
    887 }
    888 
    889 int32_t
    890 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    891                 char *dest, int32_t destCapacity,
    892                 const char *src, int32_t srcLength,
    893                 UTF8CaseMapper *stringCaseMapper,
    894                 icu::Edits *edits,
    895                 UErrorCode &errorCode) {
    896    /* check argument values */
    897    if(U_FAILURE(errorCode)) {
    898        return 0;
    899    }
    900    if( destCapacity<0 ||
    901        (dest==nullptr && destCapacity>0) ||
    902        (src==nullptr && srcLength!=0) || srcLength<-1
    903    ) {
    904        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    905        return 0;
    906    }
    907 
    908    /* get the string length */
    909    if(srcLength==-1) {
    910        srcLength = static_cast<int32_t>(uprv_strlen(src));
    911    }
    912 
    913    /* check for overlapping source and destination */
    914    if( dest!=nullptr &&
    915        ((src>=dest && src<(dest+destCapacity)) ||
    916         (dest>=src && dest<(src+srcLength)))
    917    ) {
    918        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    919        return 0;
    920    }
    921 
    922    if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
    923        edits->reset();
    924    }
    925    int32_t reslen = ByteSinkUtil::viaByteSinkToTerminatedChars(
    926        dest, destCapacity,
    927        [&](ByteSink& sink, UErrorCode& status) {
    928            stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
    929                             reinterpret_cast<const uint8_t*>(src), srcLength, sink, edits, status);
    930        },
    931        errorCode);
    932    if (U_SUCCESS(errorCode) && edits != nullptr) {
    933        edits->copyErrorTo(errorCode);
    934    }
    935    return reslen;
    936 }
    937 
    938 /* public API functions */
    939 
    940 U_CAPI int32_t U_EXPORT2
    941 ucasemap_utf8ToLower(const UCaseMap *csm,
    942                     char *dest, int32_t destCapacity,
    943                     const char *src, int32_t srcLength,
    944                     UErrorCode *pErrorCode) {
    945    return ucasemap_mapUTF8(
    946        csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
    947        dest, destCapacity,
    948        src, srcLength,
    949        ucasemap_internalUTF8ToLower, nullptr, *pErrorCode);
    950 }
    951 
    952 U_CAPI int32_t U_EXPORT2
    953 ucasemap_utf8ToUpper(const UCaseMap *csm,
    954                     char *dest, int32_t destCapacity,
    955                     const char *src, int32_t srcLength,
    956                     UErrorCode *pErrorCode) {
    957    return ucasemap_mapUTF8(
    958        csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
    959        dest, destCapacity,
    960        src, srcLength,
    961        ucasemap_internalUTF8ToUpper, nullptr, *pErrorCode);
    962 }
    963 
    964 U_CAPI int32_t U_EXPORT2
    965 ucasemap_utf8FoldCase(const UCaseMap *csm,
    966                      char *dest, int32_t destCapacity,
    967                      const char *src, int32_t srcLength,
    968                      UErrorCode *pErrorCode) {
    969    return ucasemap_mapUTF8(
    970        UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
    971        dest, destCapacity,
    972        src, srcLength,
    973        ucasemap_internalUTF8Fold, nullptr, *pErrorCode);
    974 }
    975 
    976 U_NAMESPACE_BEGIN
    977 
    978 void CaseMap::utf8ToLower(
    979        const char *locale, uint32_t options,
    980        StringPiece src, ByteSink &sink, Edits *edits,
    981        UErrorCode &errorCode) {
    982    ucasemap_mapUTF8(
    983        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
    984        src.data(), src.length(),
    985        ucasemap_internalUTF8ToLower, sink, edits, errorCode);
    986 }
    987 
    988 void CaseMap::utf8ToUpper(
    989        const char *locale, uint32_t options,
    990        StringPiece src, ByteSink &sink, Edits *edits,
    991        UErrorCode &errorCode) {
    992    ucasemap_mapUTF8(
    993        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
    994        src.data(), src.length(),
    995        ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
    996 }
    997 
    998 void CaseMap::utf8Fold(
    999        uint32_t options,
   1000        StringPiece src, ByteSink &sink, Edits *edits,
   1001        UErrorCode &errorCode) {
   1002    ucasemap_mapUTF8(
   1003        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
   1004        src.data(), src.length(),
   1005        ucasemap_internalUTF8Fold, sink, edits, errorCode);
   1006 }
   1007 
   1008 int32_t CaseMap::utf8ToLower(
   1009        const char *locale, uint32_t options,
   1010        const char *src, int32_t srcLength,
   1011        char *dest, int32_t destCapacity, Edits *edits,
   1012        UErrorCode &errorCode) {
   1013    return ucasemap_mapUTF8(
   1014        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
   1015        dest, destCapacity,
   1016        src, srcLength,
   1017        ucasemap_internalUTF8ToLower, edits, errorCode);
   1018 }
   1019 
   1020 int32_t CaseMap::utf8ToUpper(
   1021        const char *locale, uint32_t options,
   1022        const char *src, int32_t srcLength,
   1023        char *dest, int32_t destCapacity, Edits *edits,
   1024        UErrorCode &errorCode) {
   1025    return ucasemap_mapUTF8(
   1026        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
   1027        dest, destCapacity,
   1028        src, srcLength,
   1029        ucasemap_internalUTF8ToUpper, edits, errorCode);
   1030 }
   1031 
   1032 int32_t CaseMap::utf8Fold(
   1033        uint32_t options,
   1034        const char *src, int32_t srcLength,
   1035        char *dest, int32_t destCapacity, Edits *edits,
   1036        UErrorCode &errorCode) {
   1037    return ucasemap_mapUTF8(
   1038        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
   1039        dest, destCapacity,
   1040        src, srcLength,
   1041        ucasemap_internalUTF8Fold, edits, errorCode);
   1042 }
   1043 
   1044 U_NAMESPACE_END
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE