tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

normalizer2.cpp (18830B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2009-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  normalizer2.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2009nov22
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_NORMALIZATION
     22 
     23 #include "unicode/edits.h"
     24 #include "unicode/normalizer2.h"
     25 #include "unicode/stringoptions.h"
     26 #include "unicode/unistr.h"
     27 #include "unicode/unorm.h"
     28 #include "cstring.h"
     29 #include "mutex.h"
     30 #include "norm2allmodes.h"
     31 #include "normalizer2impl.h"
     32 #include "uassert.h"
     33 #include "ucln_cmn.h"
     34 
     35 using icu::Normalizer2Impl;
     36 
     37 #if NORM2_HARDCODE_NFC_DATA
     38 // NFC/NFD data machine-generated by gennorm2 --csource
     39 #define INCLUDED_FROM_NORMALIZER2_CPP
     40 #include "norm2_nfc_data.h"
     41 #endif
     42 
     43 U_NAMESPACE_BEGIN
     44 
     45 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
     46 
     47 Normalizer2::~Normalizer2() {}
     48 
     49 void
     50 Normalizer2::normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
     51                           Edits *edits, UErrorCode &errorCode) const {
     52    if (U_FAILURE(errorCode)) {
     53        return;
     54    }
     55    if (edits != nullptr) {
     56        errorCode = U_UNSUPPORTED_ERROR;
     57        return;
     58    }
     59    UnicodeString src16 = UnicodeString::fromUTF8(src);
     60    normalize(src16, errorCode).toUTF8(sink);
     61 }
     62 
     63 UBool
     64 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
     65    return false;
     66 }
     67 
     68 UChar32
     69 Normalizer2::composePair(UChar32, UChar32) const {
     70    return U_SENTINEL;
     71 }
     72 
     73 uint8_t
     74 Normalizer2::getCombiningClass(UChar32 /*c*/) const {
     75    return 0;
     76 }
     77 
     78 UBool
     79 Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
     80    return U_SUCCESS(errorCode) && isNormalized(UnicodeString::fromUTF8(s), errorCode);
     81 }
     82 
     83 // Normalizer2 implementation for the old UNORM_NONE.
     84 class NoopNormalizer2 : public Normalizer2 {
     85    virtual ~NoopNormalizer2();
     86 
     87    virtual UnicodeString &
     88    normalize(const UnicodeString &src,
     89              UnicodeString &dest,
     90              UErrorCode &errorCode) const override {
     91        if(U_SUCCESS(errorCode)) {
     92            if(&dest!=&src) {
     93                dest=src;
     94            } else {
     95                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     96            }
     97        }
     98        return dest;
     99    }
    100    virtual void
    101    normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
    102                  Edits *edits, UErrorCode &errorCode) const override {
    103        if(U_SUCCESS(errorCode)) {
    104            if (edits != nullptr) {
    105                if ((options & U_EDITS_NO_RESET) == 0) {
    106                    edits->reset();
    107                }
    108                edits->addUnchanged(src.length());
    109            }
    110            if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
    111                sink.Append(src.data(), src.length());
    112            }
    113            sink.Flush();
    114        }
    115    }
    116 
    117    virtual UnicodeString &
    118    normalizeSecondAndAppend(UnicodeString &first,
    119                             const UnicodeString &second,
    120                             UErrorCode &errorCode) const override {
    121        if(U_SUCCESS(errorCode)) {
    122            if(&first!=&second) {
    123                first.append(second);
    124            } else {
    125                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    126            }
    127        }
    128        return first;
    129    }
    130    virtual UnicodeString &
    131    append(UnicodeString &first,
    132           const UnicodeString &second,
    133           UErrorCode &errorCode) const override {
    134        if(U_SUCCESS(errorCode)) {
    135            if(&first!=&second) {
    136                first.append(second);
    137            } else {
    138                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    139            }
    140        }
    141        return first;
    142    }
    143    virtual UBool
    144    getDecomposition(UChar32, UnicodeString &) const override {
    145        return false;
    146    }
    147    // No need to override the default getRawDecomposition().
    148    virtual UBool
    149    isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
    150        return U_SUCCESS(errorCode);
    151    }
    152    virtual UBool
    153    isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
    154        return U_SUCCESS(errorCode);
    155    }
    156    virtual UNormalizationCheckResult
    157    quickCheck(const UnicodeString &, UErrorCode &) const override {
    158        return UNORM_YES;
    159    }
    160    virtual int32_t
    161    spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const override {
    162        return s.length();
    163    }
    164    virtual UBool hasBoundaryBefore(UChar32) const override { return true; }
    165    virtual UBool hasBoundaryAfter(UChar32) const override { return true; }
    166    virtual UBool isInert(UChar32) const override { return true; }
    167 };
    168 
    169 NoopNormalizer2::~NoopNormalizer2() {}
    170 
    171 Normalizer2WithImpl::~Normalizer2WithImpl() {}
    172 
    173 DecomposeNormalizer2::~DecomposeNormalizer2() {}
    174 
    175 ComposeNormalizer2::~ComposeNormalizer2() {}
    176 
    177 FCDNormalizer2::~FCDNormalizer2() {}
    178 
    179 // instance cache ---------------------------------------------------------- ***
    180 
    181 U_CDECL_BEGIN
    182 static UBool U_CALLCONV uprv_normalizer2_cleanup();
    183 U_CDECL_END
    184 
    185 static Normalizer2   *noopSingleton;
    186 static icu::UInitOnce noopInitOnce {};
    187 
    188 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
    189    if(U_FAILURE(errorCode)) {
    190        return;
    191    }
    192    noopSingleton=new NoopNormalizer2;
    193    if(noopSingleton==nullptr) {
    194        errorCode=U_MEMORY_ALLOCATION_ERROR;
    195        return;
    196    }
    197    ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
    198 }
    199 
    200 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
    201    if(U_FAILURE(errorCode)) { return nullptr; }
    202    umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
    203    return noopSingleton;
    204 }
    205 
    206 const Normalizer2Impl *
    207 Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
    208    return &((Normalizer2WithImpl *)norm2)->impl;
    209 }
    210 
    211 Norm2AllModes::~Norm2AllModes() {
    212    delete impl;
    213 }
    214 
    215 Norm2AllModes *
    216 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
    217    if(U_FAILURE(errorCode)) {
    218        delete impl;
    219        return nullptr;
    220    }
    221    Norm2AllModes *allModes=new Norm2AllModes(impl);
    222    if(allModes==nullptr) {
    223        errorCode=U_MEMORY_ALLOCATION_ERROR;
    224        delete impl;
    225        return nullptr;
    226    }
    227    return allModes;
    228 }
    229 
    230 #if NORM2_HARDCODE_NFC_DATA
    231 Norm2AllModes *
    232 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
    233    if(U_FAILURE(errorCode)) {
    234        return nullptr;
    235    }
    236    Normalizer2Impl *impl=new Normalizer2Impl;
    237    if(impl==nullptr) {
    238        errorCode=U_MEMORY_ALLOCATION_ERROR;
    239        return nullptr;
    240    }
    241    impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
    242               norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
    243    return createInstance(impl, errorCode);
    244 }
    245 
    246 static Norm2AllModes *nfcSingleton;
    247 
    248 static icu::UInitOnce nfcInitOnce {};
    249 
    250 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
    251    nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
    252    ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
    253 }
    254 
    255 const Norm2AllModes *
    256 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
    257    if(U_FAILURE(errorCode)) { return nullptr; }
    258    umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
    259    return nfcSingleton;
    260 }
    261 
    262 const Normalizer2 *
    263 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
    264    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    265    return allModes!=nullptr ? &allModes->comp : nullptr;
    266 }
    267 
    268 const Normalizer2 *
    269 Normalizer2::getNFDInstance(UErrorCode &errorCode) {
    270    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    271    return allModes!=nullptr ? &allModes->decomp : nullptr;
    272 }
    273 
    274 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
    275    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    276    return allModes!=nullptr ? &allModes->fcd : nullptr;
    277 }
    278 
    279 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
    280    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    281    return allModes!=nullptr ? &allModes->fcc : nullptr;
    282 }
    283 
    284 const Normalizer2Impl *
    285 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
    286    const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
    287    return allModes!=nullptr ? allModes->impl : nullptr;
    288 }
    289 #endif  // NORM2_HARDCODE_NFC_DATA
    290 
    291 U_CDECL_BEGIN
    292 
    293 static UBool U_CALLCONV uprv_normalizer2_cleanup() {
    294    delete noopSingleton;
    295    noopSingleton = nullptr;
    296    noopInitOnce.reset(); 
    297 #if NORM2_HARDCODE_NFC_DATA
    298    delete nfcSingleton;
    299    nfcSingleton = nullptr;
    300    nfcInitOnce.reset(); 
    301 #endif
    302    return true;
    303 }
    304 
    305 U_CDECL_END
    306 
    307 U_NAMESPACE_END
    308 
    309 // C API ------------------------------------------------------------------- ***
    310 
    311 U_NAMESPACE_USE
    312 
    313 U_CAPI const UNormalizer2 * U_EXPORT2
    314 unorm2_getNFCInstance(UErrorCode *pErrorCode) {
    315    return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
    316 }
    317 
    318 U_CAPI const UNormalizer2 * U_EXPORT2
    319 unorm2_getNFDInstance(UErrorCode *pErrorCode) {
    320    return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
    321 }
    322 
    323 U_CAPI void U_EXPORT2
    324 unorm2_close(UNormalizer2 *norm2) {
    325    delete (Normalizer2 *)norm2;
    326 }
    327 
    328 U_CAPI int32_t U_EXPORT2
    329 unorm2_normalize(const UNormalizer2 *norm2,
    330                 const char16_t *src, int32_t length,
    331                 char16_t *dest, int32_t capacity,
    332                 UErrorCode *pErrorCode) {
    333    if(U_FAILURE(*pErrorCode)) {
    334        return 0;
    335    }
    336    if( (src==nullptr ? length!=0 : length<-1) ||
    337        (dest==nullptr ? capacity!=0 : capacity<0) ||
    338        (src==dest && src!=nullptr)
    339    ) {
    340        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    341        return 0;
    342    }
    343    UnicodeString destString(dest, 0, capacity);
    344    // length==0: Nothing to do, and n2wi->normalize(nullptr, nullptr, buffer, ...) would crash.
    345    if(length!=0) {
    346        const Normalizer2 *n2=(const Normalizer2 *)norm2;
    347        const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
    348        if(n2wi!=nullptr) {
    349            // Avoid duplicate argument checking and support NUL-terminated src.
    350            ReorderingBuffer buffer(n2wi->impl, destString);
    351            if(buffer.init(length, *pErrorCode)) {
    352                n2wi->normalize(src, length>=0 ? src+length : nullptr, buffer, *pErrorCode);
    353            }
    354        } else {
    355            UnicodeString srcString(length<0, src, length);
    356            n2->normalize(srcString, destString, *pErrorCode);
    357        }
    358    }
    359    return destString.extract(dest, capacity, *pErrorCode);
    360 }
    361 
    362 static int32_t
    363 normalizeSecondAndAppend(const UNormalizer2 *norm2,
    364                         char16_t *first, int32_t firstLength, int32_t firstCapacity,
    365                         const char16_t *second, int32_t secondLength,
    366                         UBool doNormalize,
    367                         UErrorCode *pErrorCode) {
    368    if(U_FAILURE(*pErrorCode)) {
    369        return 0;
    370    }
    371    if( (second==nullptr ? secondLength!=0 : secondLength<-1) ||
    372        (first==nullptr ? (firstCapacity!=0 || firstLength!=0) :
    373                       (firstCapacity<0 || firstLength<-1)) ||
    374        (first==second && first!=nullptr)
    375    ) {
    376        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    377        return 0;
    378    }
    379    UnicodeString firstString(first, firstLength, firstCapacity);
    380    firstLength=firstString.length();  // In case it was -1.
    381    // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(nullptr, nullptr, buffer, ...) would crash.
    382    if(secondLength!=0) {
    383        const Normalizer2* n2 = reinterpret_cast<const Normalizer2*>(norm2);
    384        const Normalizer2WithImpl* n2wi = dynamic_cast<const Normalizer2WithImpl*>(n2);
    385        if(n2wi!=nullptr) {
    386            // Avoid duplicate argument checking and support NUL-terminated src.
    387            UnicodeString safeMiddle;
    388            {
    389                ReorderingBuffer buffer(n2wi->impl, firstString);
    390                if(buffer.init(firstLength+secondLength+1, *pErrorCode)) {  // destCapacity>=-1
    391                    n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : nullptr,
    392                                             doNormalize, safeMiddle, buffer, *pErrorCode);
    393                }
    394            }  // The ReorderingBuffer destructor finalizes firstString.
    395            if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
    396                // Restore the modified suffix of the first string.
    397                // This does not restore first[] array contents between firstLength and firstCapacity.
    398                // (That might be uninitialized memory, as far as we know.)
    399                if(first!=nullptr) { /* don't dereference nullptr */
    400                  safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
    401                  if(firstLength<firstCapacity) {
    402                    first[firstLength]=0;  // NUL-terminate in case it was originally.
    403                  }
    404                }
    405            }
    406        } else {
    407            UnicodeString secondString(secondLength<0, second, secondLength);
    408            if(doNormalize) {
    409                n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
    410            } else {
    411                n2->append(firstString, secondString, *pErrorCode);
    412            }
    413        }
    414    }
    415    return firstString.extract(first, firstCapacity, *pErrorCode);
    416 }
    417 
    418 U_CAPI int32_t U_EXPORT2
    419 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
    420                                char16_t *first, int32_t firstLength, int32_t firstCapacity,
    421                                const char16_t *second, int32_t secondLength,
    422                                UErrorCode *pErrorCode) {
    423    return normalizeSecondAndAppend(norm2,
    424                                    first, firstLength, firstCapacity,
    425                                    second, secondLength,
    426                                    true, pErrorCode);
    427 }
    428 
    429 U_CAPI int32_t U_EXPORT2
    430 unorm2_append(const UNormalizer2 *norm2,
    431              char16_t *first, int32_t firstLength, int32_t firstCapacity,
    432              const char16_t *second, int32_t secondLength,
    433              UErrorCode *pErrorCode) {
    434    return normalizeSecondAndAppend(norm2,
    435                                    first, firstLength, firstCapacity,
    436                                    second, secondLength,
    437                                    false, pErrorCode);
    438 }
    439 
    440 U_CAPI int32_t U_EXPORT2
    441 unorm2_getDecomposition(const UNormalizer2 *norm2,
    442                        UChar32 c, char16_t *decomposition, int32_t capacity,
    443                        UErrorCode *pErrorCode) {
    444    if(U_FAILURE(*pErrorCode)) {
    445        return 0;
    446    }
    447    if(decomposition==nullptr ? capacity!=0 : capacity<0) {
    448        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    449        return 0;
    450    }
    451    UnicodeString destString(decomposition, 0, capacity);
    452    if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
    453        return destString.extract(decomposition, capacity, *pErrorCode);
    454    } else {
    455        return -1;
    456    }
    457 }
    458 
    459 U_CAPI int32_t U_EXPORT2
    460 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
    461                           UChar32 c, char16_t *decomposition, int32_t capacity,
    462                           UErrorCode *pErrorCode) {
    463    if(U_FAILURE(*pErrorCode)) {
    464        return 0;
    465    }
    466    if(decomposition==nullptr ? capacity!=0 : capacity<0) {
    467        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    468        return 0;
    469    }
    470    UnicodeString destString(decomposition, 0, capacity);
    471    if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
    472        return destString.extract(decomposition, capacity, *pErrorCode);
    473    } else {
    474        return -1;
    475    }
    476 }
    477 
    478 U_CAPI UChar32 U_EXPORT2
    479 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
    480    return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
    481 }
    482 
    483 U_CAPI uint8_t U_EXPORT2
    484 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
    485    return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
    486 }
    487 
    488 U_CAPI UBool U_EXPORT2
    489 unorm2_isNormalized(const UNormalizer2 *norm2,
    490                    const char16_t *s, int32_t length,
    491                    UErrorCode *pErrorCode) {
    492    if(U_FAILURE(*pErrorCode)) {
    493        return 0;
    494    }
    495    if((s==nullptr && length!=0) || length<-1) {
    496        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    497        return 0;
    498    }
    499    UnicodeString sString(length<0, s, length);
    500    return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
    501 }
    502 
    503 U_CAPI UNormalizationCheckResult U_EXPORT2
    504 unorm2_quickCheck(const UNormalizer2 *norm2,
    505                  const char16_t *s, int32_t length,
    506                  UErrorCode *pErrorCode) {
    507    if(U_FAILURE(*pErrorCode)) {
    508        return UNORM_NO;
    509    }
    510    if((s==nullptr && length!=0) || length<-1) {
    511        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    512        return UNORM_NO;
    513    }
    514    UnicodeString sString(length<0, s, length);
    515    return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
    516 }
    517 
    518 U_CAPI int32_t U_EXPORT2
    519 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
    520                         const char16_t *s, int32_t length,
    521                         UErrorCode *pErrorCode) {
    522    if(U_FAILURE(*pErrorCode)) {
    523        return 0;
    524    }
    525    if((s==nullptr && length!=0) || length<-1) {
    526        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    527        return 0;
    528    }
    529    UnicodeString sString(length<0, s, length);
    530    return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
    531 }
    532 
    533 U_CAPI UBool U_EXPORT2
    534 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
    535    return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
    536 }
    537 
    538 U_CAPI UBool U_EXPORT2
    539 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
    540    return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
    541 }
    542 
    543 U_CAPI UBool U_EXPORT2
    544 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
    545    return ((const Normalizer2 *)norm2)->isInert(c);
    546 }
    547 
    548 // Some properties APIs ---------------------------------------------------- ***
    549 
    550 U_CAPI uint8_t U_EXPORT2
    551 u_getCombiningClass(UChar32 c) {
    552    UErrorCode errorCode=U_ZERO_ERROR;
    553    const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
    554    if(U_SUCCESS(errorCode)) {
    555        return nfd->getCombiningClass(c);
    556    } else {
    557        return 0;
    558    }
    559 }
    560 
    561 U_CFUNC uint16_t
    562 unorm_getFCD16(UChar32 c) {
    563    UErrorCode errorCode=U_ZERO_ERROR;
    564    const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
    565    if(U_SUCCESS(errorCode)) {
    566        return impl->getFCD16(c);
    567    } else {
    568        return 0;
    569    }
    570 }
    571 
    572 #endif  // !UCONFIG_NO_NORMALIZATION