tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

nortrans.cpp (6500B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2001-2011, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   07/03/01    aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/normalizer2.h"
     18 #include "unicode/utf16.h"
     19 #include "cstring.h"
     20 #include "nortrans.h"
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
     25 
     26 static inline Transliterator::Token cstrToken(const char *s) {
     27    return Transliterator::pointerToken((void *)s);
     28 }
     29 
     30 /**
     31 * System registration hook.
     32 */
     33 void NormalizationTransliterator::registerIDs() {
     34    // In the Token, the byte after the NUL is the UNormalization2Mode.
     35    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
     36                                     _create, cstrToken("nfc\0\0"));
     37    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
     38                                     _create, cstrToken("nfkc\0\0"));
     39    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
     40                                     _create, cstrToken("nfc\0\1"));
     41    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
     42                                     _create, cstrToken("nfkc\0\1"));
     43    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
     44                                     _create, cstrToken("nfc\0\2"));
     45    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
     46                                     _create, cstrToken("nfc\0\3"));
     47    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
     48                                            UNICODE_STRING_SIMPLE("NFD"), true);
     49    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
     50                                            UNICODE_STRING_SIMPLE("NFKD"), true);
     51    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
     52                                            UNICODE_STRING_SIMPLE("NFD"), false);
     53    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
     54                                            UNICODE_STRING_SIMPLE("FCD"), false);
     55 }
     56 
     57 /**
     58 * Factory methods
     59 */
     60 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
     61                                                     Token context) {
     62    const char* name = static_cast<const char*>(context.pointer);
     63    UNormalization2Mode mode = static_cast<UNormalization2Mode>(uprv_strchr(name, 0)[1]);
     64    UErrorCode errorCode = U_ZERO_ERROR;
     65    const Normalizer2 *norm2 = Normalizer2::getInstance(nullptr, name, mode, errorCode);
     66    if(U_SUCCESS(errorCode)) {
     67        return new NormalizationTransliterator(ID, *norm2);
     68    } else {
     69        return nullptr;
     70    }
     71 }
     72 
     73 /**
     74 * Constructs a transliterator.
     75 */
     76 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
     77                                                         const Normalizer2 &norm2) :
     78    Transliterator(id, nullptr), fNorm2(norm2) {}
     79 
     80 /**
     81 * Destructor.
     82 */
     83 NormalizationTransliterator::~NormalizationTransliterator() {
     84 }
     85 
     86 /**
     87 * Copy constructor.
     88 */
     89 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
     90    Transliterator(o), fNorm2(o.fNorm2) {}
     91 
     92 /**
     93 * Transliterator API.
     94 */
     95 NormalizationTransliterator* NormalizationTransliterator::clone() const {
     96    return new NormalizationTransliterator(*this);
     97 }
     98 
     99 /**
    100 * Implements {@link Transliterator#handleTransliterate}.
    101 */
    102 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
    103                                                      UBool isIncremental) const {
    104    // start and limit of the input range
    105    int32_t start = offsets.start;
    106    int32_t limit = offsets.limit;
    107    if(start >= limit) {
    108        return;
    109    }
    110 
    111    /*
    112     * Normalize as short chunks at a time as possible even in
    113     * bulk mode, so that styled text is minimally disrupted.
    114     * In incremental mode, a chunk that ends with offsets.limit
    115     * must not be normalized.
    116     *
    117     * If it was known that the input text is not styled, then
    118     * a bulk mode normalization could look like this:
    119 
    120    UnicodeString input, normalized;
    121    int32_t length = limit - start;
    122    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    123    input.releaseBuffer(length);
    124 
    125    UErrorCode status = U_ZERO_ERROR;
    126    fNorm2.normalize(input, normalized, status);
    127 
    128    text.handleReplaceBetween(start, limit, normalized);
    129 
    130    int32_t delta = normalized.length() - length;
    131    offsets.contextLimit += delta;
    132    offsets.limit += delta;
    133    offsets.start = limit + delta;
    134 
    135     */
    136    UErrorCode errorCode = U_ZERO_ERROR;
    137    UnicodeString segment;
    138    UnicodeString normalized;
    139    UChar32 c = text.char32At(start);
    140    do {
    141        int32_t prev = start;
    142        // Skip at least one character so we make progress.
    143        // c holds the character at start.
    144        segment.remove();
    145        do {
    146            segment.append(c);
    147            start += U16_LENGTH(c);
    148        } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
    149        if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
    150            // stop in incremental mode when we reach the input limit
    151            // in case there are additional characters that could change the
    152            // normalization result
    153            start=prev;
    154            break;
    155        }
    156        fNorm2.normalize(segment, normalized, errorCode);
    157        if(U_FAILURE(errorCode)) {
    158            break;
    159        }
    160        if(segment != normalized) {
    161            // replace the input chunk with its normalized form
    162            text.handleReplaceBetween(prev, start, normalized);
    163 
    164            // update all necessary indexes accordingly
    165            int32_t delta = normalized.length() - (start - prev);
    166            start += delta;
    167            limit += delta;
    168        }
    169    } while(start < limit);
    170 
    171    offsets.start = start;
    172    offsets.contextLimit += limit - offsets.limit;
    173    offsets.limit = limit;
    174 }
    175 
    176 U_NAMESPACE_END
    177 
    178 #endif /* #if !UCONFIG_NO_TRANSLITERATION */