tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

strrepl.cpp (11087B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2002-2012, International Business Machines Corporation
      6 *   and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   01/21/2002  aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/uniset.h"
     18 #include "unicode/utf16.h"
     19 #include "strrepl.h"
     20 #include "rbt_data.h"
     21 #include "util.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 UnicodeReplacer::~UnicodeReplacer() {}
     26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
     27 
     28 /**
     29 * Construct a StringReplacer that sets the emits the given output
     30 * text and sets the cursor to the given position.
     31 * @param theOutput text that will replace input text when the
     32 * replace() method is called.  May contain stand-in characters
     33 * that represent nested replacers.
     34 * @param theCursorPos cursor position that will be returned by
     35 * the replace() method
     36 * @param theData transliterator context object that translates
     37 * stand-in characters to UnicodeReplacer objects
     38 */
     39 StringReplacer::StringReplacer(const UnicodeString& theOutput,
     40                               int32_t theCursorPos,
     41                               const TransliterationRuleData* theData) {
     42    output = theOutput;
     43    cursorPos = theCursorPos;
     44    hasCursor = true;
     45    data = theData;
     46    isComplex = true;
     47 }
     48 
     49 /**
     50 * Construct a StringReplacer that sets the emits the given output
     51 * text and does not modify the cursor.
     52 * @param theOutput text that will replace input text when the
     53 * replace() method is called.  May contain stand-in characters
     54 * that represent nested replacers.
     55 * @param theData transliterator context object that translates
     56 * stand-in characters to UnicodeReplacer objects
     57 */
     58 StringReplacer::StringReplacer(const UnicodeString& theOutput,
     59                               const TransliterationRuleData* theData) {
     60    output = theOutput;
     61    cursorPos = 0;
     62    hasCursor = false;
     63    data = theData;
     64    isComplex = true;
     65 }
     66 
     67 /**
     68 * Copy constructor.
     69 */
     70 StringReplacer::StringReplacer(const StringReplacer& other) :
     71    UnicodeFunctor(other),
     72    UnicodeReplacer(other)
     73 {
     74    output = other.output;
     75    cursorPos = other.cursorPos;
     76    hasCursor = other.hasCursor;
     77    data = other.data;
     78    isComplex = other.isComplex;
     79 }
     80 
     81 /**
     82 * Destructor
     83 */
     84 StringReplacer::~StringReplacer() {
     85 }
     86 
     87 /**
     88 * Implement UnicodeFunctor
     89 */
     90 StringReplacer* StringReplacer::clone() const {
     91    return new StringReplacer(*this);
     92 }
     93 
     94 /**
     95 * Implement UnicodeFunctor
     96 */
     97 UnicodeReplacer* StringReplacer::toReplacer() const {
     98  return const_cast<StringReplacer *>(this);
     99 }
    100 
    101 /**
    102 * UnicodeReplacer API
    103 */
    104 int32_t StringReplacer::replace(Replaceable& text,
    105                                int32_t start,
    106                                int32_t limit,
    107                                int32_t& cursor) {
    108    int32_t outLen;
    109    int32_t newStart = 0;
    110 
    111    // NOTE: It should be possible to _always_ run the complex
    112    // processing code; just slower.  If not, then there is a bug
    113    // in the complex processing code.
    114 
    115    // Simple (no nested replacers) Processing Code :
    116    if (!isComplex) {
    117        text.handleReplaceBetween(start, limit, output);
    118        outLen = output.length();
    119 
    120        // Setup default cursor position (for cursorPos within output)
    121        newStart = cursorPos;
    122    }
    123 
    124    // Complex (nested replacers) Processing Code :
    125    else {
    126        /* When there are segments to be copied, use the Replaceable.copy()
    127         * API in order to retain out-of-band data.  Copy everything to the
    128         * end of the string, then copy them back over the key.  This preserves
    129         * the integrity of indices into the key and surrounding context while
    130         * generating the output text.
    131         */
    132        UnicodeString buf;
    133        int32_t oOutput; // offset into 'output'
    134        isComplex = false;
    135 
    136        // The temporary buffer starts at tempStart, and extends
    137        // to destLimit.  The start of the buffer has a single
    138        // character from before the key.  This provides style
    139        // data when addition characters are filled into the
    140        // temporary buffer.  If there is nothing to the left, use
    141        // the non-character U+FFFF, which Replaceable subclasses
    142        // should treat specially as a "no-style character."
    143        // destStart points to the point after the style context
    144        // character, so it is tempStart+1 or tempStart+2.
    145        int32_t tempStart = text.length(); // start of temp buffer
    146        int32_t destStart = tempStart; // copy new text to here
    147        if (start > 0) {
    148            int32_t len = U16_LENGTH(text.char32At(start-1));
    149            text.copy(start-len, start, tempStart);
    150            destStart += len;
    151        } else {
    152            UnicodeString str(static_cast<char16_t>(0xFFFF));
    153            text.handleReplaceBetween(tempStart, tempStart, str);
    154            destStart++;
    155        }
    156        int32_t destLimit = destStart;
    157 
    158        for (oOutput=0; oOutput<output.length(); ) {
    159            if (oOutput == cursorPos) {
    160                // Record the position of the cursor
    161                newStart = destLimit - destStart; // relative to start
    162            }
    163            UChar32 c = output.char32At(oOutput);
    164            UnicodeReplacer* r = data->lookupReplacer(c);
    165            if (r == nullptr) {
    166                // Accumulate straight (non-segment) text.
    167                buf.append(c);
    168            } else {
    169                isComplex = true;
    170 
    171                // Insert any accumulated straight text.
    172                if (buf.length() > 0) {
    173                    text.handleReplaceBetween(destLimit, destLimit, buf);
    174                    destLimit += buf.length();
    175                    buf.truncate(0);
    176                }
    177 
    178                // Delegate output generation to replacer object
    179                int32_t len = r->replace(text, destLimit, destLimit, cursor);
    180                destLimit += len;
    181            }
    182            oOutput += U16_LENGTH(c);
    183        }
    184        // Insert any accumulated straight text.
    185        if (buf.length() > 0) {
    186            text.handleReplaceBetween(destLimit, destLimit, buf);
    187            destLimit += buf.length();
    188        }
    189        if (oOutput == cursorPos) {
    190            // Record the position of the cursor
    191            newStart = destLimit - destStart; // relative to start
    192        }
    193 
    194        outLen = destLimit - destStart;
    195 
    196        // Copy new text to start, and delete it
    197        text.copy(destStart, destLimit, start);
    198        text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
    199 
    200        // Delete the old text (the key)
    201        text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
    202    }        
    203 
    204    if (hasCursor) {
    205        // Adjust the cursor for positions outside the key.  These
    206        // refer to code points rather than code units.  If cursorPos
    207        // is within the output string, then use newStart, which has
    208        // already been set above.
    209        if (cursorPos < 0) {
    210            newStart = start;
    211            int32_t n = cursorPos;
    212            // Outside the output string, cursorPos counts code points
    213            while (n < 0 && newStart > 0) {
    214                newStart -= U16_LENGTH(text.char32At(newStart-1));
    215                ++n;
    216            }
    217            newStart += n;
    218        } else if (cursorPos > output.length()) {
    219            newStart = start + outLen;
    220            int32_t n = cursorPos - output.length();
    221            // Outside the output string, cursorPos counts code points
    222            while (n > 0 && newStart < text.length()) {
    223                newStart += U16_LENGTH(text.char32At(newStart));
    224                --n;
    225            }
    226            newStart += n;
    227        } else {
    228            // Cursor is within output string.  It has been set up above
    229            // to be relative to start.
    230            newStart += start;
    231        }
    232 
    233        cursor = newStart;
    234    }
    235 
    236    return outLen;
    237 }
    238 
    239 /**
    240 * UnicodeReplacer API
    241 */
    242 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
    243                                                 UBool escapeUnprintable) const {
    244    rule.truncate(0);
    245    UnicodeString quoteBuf;
    246 
    247    int32_t cursor = cursorPos;
    248 
    249    // Handle a cursor preceding the output
    250    if (hasCursor && cursor < 0) {
    251        while (cursor++ < 0) {
    252            ICU_Utility::appendToRule(rule, static_cast<char16_t>(0x0040) /*@*/, true, escapeUnprintable, quoteBuf);
    253        }
    254        // Fall through and append '|' below
    255    }
    256 
    257    for (int32_t i=0; i<output.length(); ++i) {
    258        if (hasCursor && i == cursor) {
    259            ICU_Utility::appendToRule(rule, static_cast<char16_t>(0x007C) /*|*/, true, escapeUnprintable, quoteBuf);
    260        }
    261        char16_t c = output.charAt(i); // Ok to use 16-bits here
    262 
    263        UnicodeReplacer* r = data->lookupReplacer(c);
    264        if (r == nullptr) {
    265            ICU_Utility::appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
    266        } else {
    267            UnicodeString buf;
    268            r->toReplacerPattern(buf, escapeUnprintable);
    269            buf.insert(0, static_cast<char16_t>(0x20));
    270            buf.append(static_cast<char16_t>(0x20));
    271            ICU_Utility::appendToRule(rule, buf,
    272                                      true, escapeUnprintable, quoteBuf);
    273        }
    274    }
    275 
    276    // Handle a cursor after the output.  Use > rather than >= because
    277    // if cursor == output.length() it is at the end of the output,
    278    // which is the default position, so we need not emit it.
    279    if (hasCursor && cursor > output.length()) {
    280        cursor -= output.length();
    281        while (cursor-- > 0) {
    282            ICU_Utility::appendToRule(rule, static_cast<char16_t>(0x0040) /*@*/, true, escapeUnprintable, quoteBuf);
    283        }
    284        ICU_Utility::appendToRule(rule, static_cast<char16_t>(0x007C) /*|*/, true, escapeUnprintable, quoteBuf);
    285    }
    286    // Flush quoteBuf out to result
    287    ICU_Utility::appendToRule(rule, -1,
    288                              true, escapeUnprintable, quoteBuf);
    289 
    290    return rule;
    291 }
    292 
    293 /**
    294 * Implement UnicodeReplacer
    295 */
    296 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
    297    UChar32 ch;
    298    for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
    299    ch = output.char32At(i);
    300    UnicodeReplacer* r = data->lookupReplacer(ch);
    301    if (r == nullptr) {
    302        toUnionTo.add(ch);
    303    } else {
    304        r->addReplacementSetTo(toUnionTo);
    305    }
    306    }
    307 }
    308 
    309 /**
    310 * UnicodeFunctor API
    311 */
    312 void StringReplacer::setData(const TransliterationRuleData* d) {
    313    data = d;
    314    int32_t i = 0;
    315    while (i<output.length()) {
    316        UChar32 c = output.char32At(i);
    317        UnicodeFunctor* f = data->lookup(c);
    318        if (f != nullptr) {
    319            f->setData(data);
    320        }
    321        i += U16_LENGTH(c);
    322    }
    323 }
    324 
    325 U_NAMESPACE_END
    326 
    327 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    328 
    329 //eof