tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

normlzr.cpp (15145B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *************************************************************************
      5 * COPYRIGHT: 
      6 * Copyright (c) 1996-2012, International Business Machines Corporation and
      7 * others. All Rights Reserved.
      8 *************************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_NORMALIZATION
     14 
     15 #include "unicode/uniset.h"
     16 #include "unicode/unistr.h"
     17 #include "unicode/chariter.h"
     18 #include "unicode/schriter.h"
     19 #include "unicode/uchriter.h"
     20 #include "unicode/normlzr.h"
     21 #include "unicode/utf16.h"
     22 #include "cmemory.h"
     23 #include "normalizer2impl.h"
     24 #include "uprops.h"  // for uniset_getUnicode32Instance()
     25 
     26 #if defined(move32)
     27 // System can define move32 intrinsics, but the char iters define move32 method
     28 // using same undef trick in headers, so undef here to re-enable the method.
     29 #undef move32
     30 #endif
     31 
     32 U_NAMESPACE_BEGIN
     33 
     34 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
     35 
     36 //-------------------------------------------------------------------------
     37 // Constructors and other boilerplate
     38 //-------------------------------------------------------------------------
     39 
     40 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
     41    UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
     42    text(new StringCharacterIterator(str)),
     43    currentIndex(0), nextIndex(0),
     44    buffer(), bufferPos(0)
     45 {
     46    init();
     47 }
     48 
     49 Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
     50    UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
     51    text(new UCharCharacterIterator(str, length)),
     52    currentIndex(0), nextIndex(0),
     53    buffer(), bufferPos(0)
     54 {
     55    init();
     56 }
     57 
     58 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
     59    UObject(), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(mode), fOptions(0),
     60    text(iter.clone()),
     61    currentIndex(0), nextIndex(0),
     62    buffer(), bufferPos(0)
     63 {
     64    init();
     65 }
     66 
     67 Normalizer::Normalizer(const Normalizer &copy) :
     68    UObject(copy), fFilteredNorm2(nullptr), fNorm2(nullptr), fUMode(copy.fUMode), fOptions(copy.fOptions),
     69    text(copy.text->clone()),
     70    currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
     71    buffer(copy.buffer), bufferPos(copy.bufferPos)
     72 {
     73    init();
     74 }
     75 
     76 void
     77 Normalizer::init() {
     78    UErrorCode errorCode=U_ZERO_ERROR;
     79    fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
     80    if(fOptions&UNORM_UNICODE_3_2) {
     81        delete fFilteredNorm2;
     82        fNorm2=fFilteredNorm2=
     83            new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
     84    }
     85    if(U_FAILURE(errorCode)) {
     86        errorCode=U_ZERO_ERROR;
     87        fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
     88    }
     89 }
     90 
     91 Normalizer::~Normalizer()
     92 {
     93    delete fFilteredNorm2;
     94    delete text;
     95 }
     96 
     97 Normalizer* 
     98 Normalizer::clone() const
     99 {
    100    return new Normalizer(*this);
    101 }
    102 
    103 /**
    104 * Generates a hash code for this iterator.
    105 */
    106 int32_t Normalizer::hashCode() const
    107 {
    108    return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
    109 }
    110    
    111 bool Normalizer::operator==(const Normalizer& that) const
    112 {
    113    return
    114        this==&that ||
    115        (fUMode==that.fUMode &&
    116        fOptions==that.fOptions &&
    117        *text==*that.text &&
    118        buffer==that.buffer &&
    119        bufferPos==that.bufferPos &&
    120        nextIndex==that.nextIndex);
    121 }
    122 
    123 //-------------------------------------------------------------------------
    124 // Static utility methods
    125 //-------------------------------------------------------------------------
    126 
    127 void U_EXPORT2
    128 Normalizer::normalize(const UnicodeString& source, 
    129                      UNormalizationMode mode, int32_t options,
    130                      UnicodeString& result, 
    131                      UErrorCode &status) {
    132    if(source.isBogus() || U_FAILURE(status)) {
    133        result.setToBogus();
    134        if(U_SUCCESS(status)) {
    135            status=U_ILLEGAL_ARGUMENT_ERROR;
    136        }
    137    } else {
    138        UnicodeString localDest;
    139        UnicodeString *dest;
    140 
    141        if(&source!=&result) {
    142            dest=&result;
    143        } else {
    144            // the source and result strings are the same object, use a temporary one
    145            dest=&localDest;
    146        }
    147        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    148        if(U_SUCCESS(status)) {
    149            if(options&UNORM_UNICODE_3_2) {
    150                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
    151                    normalize(source, *dest, status);
    152            } else {
    153                n2->normalize(source, *dest, status);
    154            }
    155        }
    156        if(dest==&localDest && U_SUCCESS(status)) {
    157            result=*dest;
    158        }
    159    }
    160 }
    161 
    162 void U_EXPORT2
    163 Normalizer::compose(const UnicodeString& source, 
    164                    UBool compat, int32_t options,
    165                    UnicodeString& result, 
    166                    UErrorCode &status) {
    167    normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
    168 }
    169 
    170 void U_EXPORT2
    171 Normalizer::decompose(const UnicodeString& source, 
    172                      UBool compat, int32_t options,
    173                      UnicodeString& result, 
    174                      UErrorCode &status) {
    175    normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
    176 }
    177 
    178 UNormalizationCheckResult
    179 Normalizer::quickCheck(const UnicodeString& source,
    180                       UNormalizationMode mode, int32_t options,
    181                       UErrorCode &status) {
    182    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    183    if(U_SUCCESS(status)) {
    184        if(options&UNORM_UNICODE_3_2) {
    185            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
    186                quickCheck(source, status);
    187        } else {
    188            return n2->quickCheck(source, status);
    189        }
    190    } else {
    191        return UNORM_MAYBE;
    192    }
    193 }
    194 
    195 UBool
    196 Normalizer::isNormalized(const UnicodeString& source,
    197                         UNormalizationMode mode, int32_t options,
    198                         UErrorCode &status) {
    199    const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
    200    if(U_SUCCESS(status)) {
    201        if(options&UNORM_UNICODE_3_2) {
    202            return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
    203                isNormalized(source, status);
    204        } else {
    205            return n2->isNormalized(source, status);
    206        }
    207    } else {
    208        return false;
    209    }
    210 }
    211 
    212 UnicodeString & U_EXPORT2
    213 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
    214                        UnicodeString &result,
    215                        UNormalizationMode mode, int32_t options,
    216                        UErrorCode &errorCode) {
    217    if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
    218        result.setToBogus();
    219        if(U_SUCCESS(errorCode)) {
    220            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    221        }
    222    } else {
    223        UnicodeString localDest;
    224        UnicodeString *dest;
    225 
    226        if(&right!=&result) {
    227            dest=&result;
    228        } else {
    229            // the right and result strings are the same object, use a temporary one
    230            dest=&localDest;
    231        }
    232        *dest=left;
    233        const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
    234        if(U_SUCCESS(errorCode)) {
    235            if(options&UNORM_UNICODE_3_2) {
    236                FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
    237                    append(*dest, right, errorCode);
    238            } else {
    239                n2->append(*dest, right, errorCode);
    240            }
    241        }
    242        if(dest==&localDest && U_SUCCESS(errorCode)) {
    243            result=*dest;
    244        }
    245    }
    246    return result;
    247 }
    248 
    249 //-------------------------------------------------------------------------
    250 // Iteration API
    251 //-------------------------------------------------------------------------
    252 
    253 /**
    254 * Return the current character in the normalized text.
    255 */
    256 UChar32 Normalizer::current() {
    257    if(bufferPos<buffer.length() || nextNormalize()) {
    258        return buffer.char32At(bufferPos);
    259    } else {
    260        return DONE;
    261    }
    262 }
    263 
    264 /**
    265 * Return the next character in the normalized text and advance
    266 * the iteration position by one.  If the end
    267 * of the text has already been reached, {@link #DONE} is returned.
    268 */
    269 UChar32 Normalizer::next() {
    270    if(bufferPos<buffer.length() ||  nextNormalize()) {
    271        UChar32 c=buffer.char32At(bufferPos);
    272        bufferPos+=U16_LENGTH(c);
    273        return c;
    274    } else {
    275        return DONE;
    276    }
    277 }
    278 
    279 /**
    280 * Return the previous character in the normalized text and decrement
    281 * the iteration position by one.  If the beginning
    282 * of the text has already been reached, {@link #DONE} is returned.
    283 */
    284 UChar32 Normalizer::previous() {
    285    if(bufferPos>0 || previousNormalize()) {
    286        UChar32 c=buffer.char32At(bufferPos-1);
    287        bufferPos-=U16_LENGTH(c);
    288        return c;
    289    } else {
    290        return DONE;
    291    }
    292 }
    293 
    294 void Normalizer::reset() {
    295    currentIndex=nextIndex=text->setToStart();
    296    clearBuffer();
    297 }
    298 
    299 void
    300 Normalizer::setIndexOnly(int32_t index) {
    301    text->setIndex(index);  // pins index
    302    currentIndex=nextIndex=text->getIndex();
    303    clearBuffer();
    304 }
    305 
    306 /**
    307 * Return the first character in the normalized text.  This resets
    308 * the <tt>Normalizer's</tt> position to the beginning of the text.
    309 */
    310 UChar32 Normalizer::first() {
    311    reset();
    312    return next();
    313 }
    314 
    315 /**
    316 * Return the last character in the normalized text.  This resets
    317 * the <tt>Normalizer's</tt> position to be just before the
    318 * the input text corresponding to that normalized character.
    319 */
    320 UChar32 Normalizer::last() {
    321    currentIndex=nextIndex=text->setToEnd();
    322    clearBuffer();
    323    return previous();
    324 }
    325 
    326 /**
    327 * Retrieve the current iteration position in the input text that is
    328 * being normalized.  This method is useful in applications such as
    329 * searching, where you need to be able to determine the position in
    330 * the input text that corresponds to a given normalized output character.
    331 * <p>
    332 * <b>Note:</b> This method sets the position in the <em>input</em>, while
    333 * {@link #next} and {@link #previous} iterate through characters in the
    334 * <em>output</em>.  This means that there is not necessarily a one-to-one
    335 * correspondence between characters returned by <tt>next</tt> and
    336 * <tt>previous</tt> and the indices passed to and returned from
    337 * <tt>setIndex</tt> and {@link #getIndex}.
    338 *
    339 */
    340 int32_t Normalizer::getIndex() const {
    341    if(bufferPos<buffer.length()) {
    342        return currentIndex;
    343    } else {
    344        return nextIndex;
    345    }
    346 }
    347 
    348 /**
    349 * Retrieve the index of the start of the input text.  This is the begin index
    350 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
    351 * over which this <tt>Normalizer</tt> is iterating
    352 */
    353 int32_t Normalizer::startIndex() const {
    354    return text->startIndex();
    355 }
    356 
    357 /**
    358 * Retrieve the index of the end of the input text.  This is the end index
    359 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
    360 * over which this <tt>Normalizer</tt> is iterating
    361 */
    362 int32_t Normalizer::endIndex() const {
    363    return text->endIndex();
    364 }
    365 
    366 //-------------------------------------------------------------------------
    367 // Property access methods
    368 //-------------------------------------------------------------------------
    369 
    370 void
    371 Normalizer::setMode(UNormalizationMode newMode) 
    372 {
    373    fUMode = newMode;
    374    init();
    375 }
    376 
    377 UNormalizationMode
    378 Normalizer::getUMode() const
    379 {
    380    return fUMode;
    381 }
    382 
    383 void
    384 Normalizer::setOption(int32_t option, 
    385                      UBool value) 
    386 {
    387    if (value) {
    388        fOptions |= option;
    389    } else {
    390        fOptions &= (~option);
    391    }
    392    init();
    393 }
    394 
    395 UBool
    396 Normalizer::getOption(int32_t option) const
    397 {
    398    return (fOptions & option) != 0;
    399 }
    400 
    401 /**
    402 * Set the input text over which this <tt>Normalizer</tt> will iterate.
    403 * The iteration position is set to the beginning of the input text.
    404 */
    405 void
    406 Normalizer::setText(const UnicodeString& newText, 
    407                    UErrorCode &status)
    408 {
    409    if (U_FAILURE(status)) {
    410        return;
    411    }
    412    CharacterIterator *newIter = new StringCharacterIterator(newText);
    413    if (newIter == nullptr) {
    414        status = U_MEMORY_ALLOCATION_ERROR;
    415        return;
    416    }
    417    delete text;
    418    text = newIter;
    419    reset();
    420 }
    421 
    422 /**
    423 * Set the input text over which this <tt>Normalizer</tt> will iterate.
    424 * The iteration position is set to the beginning of the string.
    425 */
    426 void
    427 Normalizer::setText(const CharacterIterator& newText, 
    428                    UErrorCode &status) 
    429 {
    430    if (U_FAILURE(status)) {
    431        return;
    432    }
    433    CharacterIterator *newIter = newText.clone();
    434    if (newIter == nullptr) {
    435        status = U_MEMORY_ALLOCATION_ERROR;
    436        return;
    437    }
    438    delete text;
    439    text = newIter;
    440    reset();
    441 }
    442 
    443 void
    444 Normalizer::setText(ConstChar16Ptr newText,
    445                    int32_t length,
    446                    UErrorCode &status)
    447 {
    448    if (U_FAILURE(status)) {
    449        return;
    450    }
    451    CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
    452    if (newIter == nullptr) {
    453        status = U_MEMORY_ALLOCATION_ERROR;
    454        return;
    455    }
    456    delete text;
    457    text = newIter;
    458    reset();
    459 }
    460 
    461 /**
    462 * Copies the text under iteration into the UnicodeString referred to by "result".
    463 * @param result Receives a copy of the text under iteration.
    464 */
    465 void
    466 Normalizer::getText(UnicodeString&  result) 
    467 {
    468    text->getText(result);
    469 }
    470 
    471 //-------------------------------------------------------------------------
    472 // Private utility methods
    473 //-------------------------------------------------------------------------
    474 
    475 void Normalizer::clearBuffer() {
    476    buffer.remove();
    477    bufferPos=0;
    478 }
    479 
    480 UBool
    481 Normalizer::nextNormalize() {
    482    clearBuffer();
    483    currentIndex=nextIndex;
    484    text->setIndex(nextIndex);
    485    if(!text->hasNext()) {
    486        return false;
    487    }
    488    // Skip at least one character so we make progress.
    489    UnicodeString segment(text->next32PostInc());
    490    while(text->hasNext()) {
    491        UChar32 c;
    492        if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
    493            text->move32(-1, CharacterIterator::kCurrent);
    494            break;
    495        }
    496        segment.append(c);
    497    }
    498    nextIndex=text->getIndex();
    499    UErrorCode errorCode=U_ZERO_ERROR;
    500    fNorm2->normalize(segment, buffer, errorCode);
    501    return U_SUCCESS(errorCode) && !buffer.isEmpty();
    502 }
    503 
    504 UBool
    505 Normalizer::previousNormalize() {
    506    clearBuffer();
    507    nextIndex=currentIndex;
    508    text->setIndex(currentIndex);
    509    if(!text->hasPrevious()) {
    510        return false;
    511    }
    512    UnicodeString segment;
    513    while(text->hasPrevious()) {
    514        UChar32 c=text->previous32();
    515        segment.insert(0, c);
    516        if(fNorm2->hasBoundaryBefore(c)) {
    517            break;
    518        }
    519    }
    520    currentIndex=text->getIndex();
    521    UErrorCode errorCode=U_ZERO_ERROR;
    522    fNorm2->normalize(segment, buffer, errorCode);
    523    bufferPos=buffer.length();
    524    return U_SUCCESS(errorCode) && !buffer.isEmpty();
    525 }
    526 
    527 U_NAMESPACE_END
    528 
    529 #endif /* #if !UCONFIG_NO_NORMALIZATION */