tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbt_set.cpp (16003B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 1999-2011, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   11/17/99    aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/unistr.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/utf16.h"
     20 #include "rbt_set.h"
     21 #include "rbt_rule.h"
     22 #include "cmemory.h"
     23 #include "putilimp.h"
     24 
     25 U_CDECL_BEGIN
     26 static void U_CALLCONV _deleteRule(void *rule) {
     27    delete (icu::TransliterationRule *)rule;
     28 }
     29 U_CDECL_END
     30 
     31 //----------------------------------------------------------------------
     32 // BEGIN Debugging support
     33 //----------------------------------------------------------------------
     34 
     35 // #define DEBUG_RBT
     36 
     37 #ifdef DEBUG_RBT
     38 #include <stdio.h>
     39 #include "charstr.h"
     40 
     41 /**
     42 * @param appendTo result is appended to this param.
     43 * @param input the string being transliterated
     44 * @param pos the index struct
     45 */
     46 static UnicodeString& _formatInput(UnicodeString &appendTo,
     47                                   const UnicodeString& input,
     48                                   const UTransPosition& pos) {
     49    // Output a string of the form aaa{bbb|ccc|ddd}eee, where
     50    // the {} indicate the context start and limit, and the ||
     51    // indicate the start and limit.
     52    if (0 <= pos.contextStart &&
     53        pos.contextStart <= pos.start &&
     54        pos.start <= pos.limit &&
     55        pos.limit <= pos.contextLimit &&
     56        pos.contextLimit <= input.length()) {
     57 
     58        UnicodeString a, b, c, d, e;
     59        input.extractBetween(0, pos.contextStart, a);
     60        input.extractBetween(pos.contextStart, pos.start, b);
     61        input.extractBetween(pos.start, pos.limit, c);
     62        input.extractBetween(pos.limit, pos.contextLimit, d);
     63        input.extractBetween(pos.contextLimit, input.length(), e);
     64        appendTo.append(a).append((char16_t)123/*{*/).append(b).
     65            append((char16_t)124/*|*/).append(c).append((char16_t)124/*|*/).append(d).
     66            append((char16_t)125/*}*/).append(e);
     67    } else {
     68        appendTo.append("INVALID UTransPosition");
     69        //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
     70        //                pos.contextStart + ", s=" + pos.start + ", l=" +
     71        //                pos.limit + ", cl=" + pos.contextLimit + "} on " +
     72        //                input);
     73    }
     74    return appendTo;
     75 }
     76 
     77 // Append a hex string to the target
     78 UnicodeString& _appendHex(uint32_t number,
     79                          int32_t digits,
     80                          UnicodeString& target) {
     81    static const char16_t digitString[] = {
     82        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
     83        0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0
     84    };
     85    while (digits--) {
     86        target += digitString[(number >> (digits*4)) & 0xF];
     87    }
     88    return target;
     89 }
     90 
     91 // Replace nonprintable characters with unicode escapes
     92 UnicodeString& _escape(const UnicodeString &source,
     93                       UnicodeString &target) {
     94    for (int32_t i = 0; i < source.length(); ) {
     95        UChar32 ch = source.char32At(i);
     96        i += U16_LENGTH(ch);
     97        if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) {
     98            if (ch <= 0xFFFF) {
     99                target += "\\u";
    100                _appendHex(ch, 4, target);
    101            } else {
    102                target += "\\U";
    103                _appendHex(ch, 8, target);
    104            }
    105        } else {
    106            target += ch;
    107        }
    108    }
    109    return target;
    110 }
    111 
    112 inline void _debugOut(const char* msg, TransliterationRule* rule,
    113                      const Replaceable& theText, UTransPosition& pos) {
    114    UnicodeString buf(msg, "");
    115    if (rule) {
    116        UnicodeString r;
    117        rule->toRule(r, true);
    118        buf.append((char16_t)32).append(r);
    119    }
    120    buf.append(UnicodeString(" => ", ""));
    121    UnicodeString* text = (UnicodeString*)&theText;
    122    _formatInput(buf, *text, pos);
    123    UnicodeString esc;
    124    _escape(buf, esc);
    125    CharString cbuf(esc);
    126    printf("%s\n", (const char*) cbuf);
    127 }
    128 
    129 #else
    130 #define _debugOut(msg, rule, theText, pos)
    131 #endif
    132 
    133 //----------------------------------------------------------------------
    134 // END Debugging support
    135 //----------------------------------------------------------------------
    136 
    137 // Fill the precontext and postcontext with the patterns of the rules
    138 // that are masking one another.
    139 static void maskingError(const icu::TransliterationRule& rule1,
    140                         const icu::TransliterationRule& rule2,
    141                         UParseError& parseError) {
    142    icu::UnicodeString r;
    143    int32_t len;
    144 
    145    parseError.line = parseError.offset = -1;
    146    
    147    // for pre-context
    148    rule1.toRule(r, false);
    149    len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);
    150    r.extract(0, len, parseError.preContext);
    151    parseError.preContext[len] = 0;   
    152    
    153    //for post-context
    154    r.truncate(0);
    155    rule2.toRule(r, false);
    156    len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);
    157    r.extract(0, len, parseError.postContext);
    158    parseError.postContext[len] = 0;   
    159 }
    160 
    161 U_NAMESPACE_BEGIN
    162 
    163 /**
    164 * Construct a new empty rule set.
    165 */
    166 TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) :
    167        UMemory(), ruleVector(nullptr), rules(nullptr), index {}, maxContextLength(0) {
    168    LocalPointer<UVector> lpRuleVector(new UVector(_deleteRule, nullptr, status), status);
    169    if (U_FAILURE(status)) {
    170        return;
    171    }
    172    ruleVector = lpRuleVector.orphan();
    173 }
    174 
    175 /**
    176 * Copy constructor.
    177 */
    178 TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) :
    179    UMemory(other),
    180    ruleVector(nullptr),
    181    rules(nullptr),
    182    maxContextLength(other.maxContextLength) {
    183 
    184    int32_t i, len;
    185    uprv_memcpy(index, other.index, sizeof(index));
    186    UErrorCode status = U_ZERO_ERROR;
    187    LocalPointer<UVector> lpRuleVector(new UVector(_deleteRule, nullptr, status), status);
    188    if (U_FAILURE(status)) {
    189        return;
    190    }
    191    ruleVector = lpRuleVector.orphan();
    192    if (other.ruleVector != nullptr && U_SUCCESS(status)) {
    193        len = other.ruleVector->size();
    194        for (i=0; i<len && U_SUCCESS(status); ++i) {
    195            LocalPointer<TransliterationRule> tempTranslitRule(
    196                new TransliterationRule(*static_cast<TransliterationRule*>(other.ruleVector->elementAt(i))), status);
    197            ruleVector->adoptElement(tempTranslitRule.orphan(), status);
    198        }
    199    }
    200    if (other.rules != nullptr && U_SUCCESS(status)) {
    201        UParseError p;
    202        freeze(p, status);
    203    }
    204 }
    205 
    206 /**
    207 * Destructor.
    208 */
    209 TransliterationRuleSet::~TransliterationRuleSet() {
    210    delete ruleVector; // This deletes the contained rules
    211    uprv_free(rules);
    212 }
    213 
    214 void TransliterationRuleSet::setData(const TransliterationRuleData* d) {
    215    /**
    216     * We assume that the ruleset has already been frozen.
    217     */
    218    int32_t len = index[256]; // see freeze()
    219    for (int32_t i=0; i<len; ++i) {
    220        rules[i]->setData(d);
    221    }
    222 }
    223 
    224 /**
    225 * Return the maximum context length.
    226 * @return the length of the longest preceding context.
    227 */
    228 int32_t TransliterationRuleSet::getMaximumContextLength() const {
    229    return maxContextLength;
    230 }
    231 
    232 /**
    233 * Add a rule to this set.  Rules are added in order, and order is
    234 * significant.  The last call to this method must be followed by
    235 * a call to <code>freeze()</code> before the rule set is used.
    236 *
    237 * <p>If freeze() has already been called, calling addRule()
    238 * unfreezes the rules, and freeze() must be called again.
    239 *
    240 * @param adoptedRule the rule to add
    241 */
    242 void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
    243                                     UErrorCode& status) {
    244    LocalPointer<TransliterationRule> lpAdoptedRule(adoptedRule);
    245    ruleVector->adoptElement(lpAdoptedRule.orphan(), status);
    246    if (U_FAILURE(status)) {
    247        return;
    248    }
    249 
    250    int32_t len;
    251    if ((len = adoptedRule->getContextLength()) > maxContextLength) {
    252        maxContextLength = len;
    253    }
    254 
    255    uprv_free(rules);
    256    rules = nullptr;
    257 }
    258 
    259 /**
    260 * Check this for masked rules and index it to optimize performance.
    261 * The sequence of operations is: (1) add rules to a set using
    262 * <code>addRule()</code>; (2) freeze the set using
    263 * <code>freeze()</code>; (3) use the rule set.  If
    264 * <code>addRule()</code> is called after calling this method, it
    265 * invalidates this object, and this method must be called again.
    266 * That is, <code>freeze()</code> may be called multiple times,
    267 * although for optimal performance it shouldn't be.
    268 */
    269 void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) {
    270    /* Construct the rule array and index table.  We reorder the
    271     * rules by sorting them into 256 bins.  Each bin contains all
    272     * rules matching the index value for that bin.  A rule
    273     * matches an index value if string whose first key character
    274     * has a low byte equal to the index value can match the rule.
    275     *
    276     * Each bin contains zero or more rules, in the same order
    277     * they were found originally.  However, the total rules in
    278     * the bins may exceed the number in the original vector,
    279     * since rules that have a variable as their first key
    280     * character will generally fall into more than one bin.
    281     *
    282     * That is, each bin contains all rules that either have that
    283     * first index value as their first key character, or have
    284     * a set containing the index value as their first character.
    285     */
    286    int32_t n = ruleVector->size();
    287    int32_t j;
    288    int16_t x;
    289    UVector v(2*n, status); // heuristic; adjust as needed
    290 
    291    if (U_FAILURE(status)) {
    292        return;
    293    }
    294 
    295    /* Precompute the index values.  This saves a LOT of time.
    296     * Be careful not to call malloc(0).
    297     */
    298    int16_t* indexValue = static_cast<int16_t*>(uprv_malloc(sizeof(int16_t) * (n > 0 ? n : 1)));
    299    /* test for nullptr */
    300    if (indexValue == nullptr) {
    301        status = U_MEMORY_ALLOCATION_ERROR;
    302        return;
    303    }
    304    for (j=0; j<n; ++j) {
    305        TransliterationRule* r = static_cast<TransliterationRule*>(ruleVector->elementAt(j));
    306        indexValue[j] = r->getIndexValue();
    307    }
    308    for (x=0; x<256; ++x) {
    309        index[x] = v.size();
    310        for (j=0; j<n; ++j) {
    311            if (indexValue[j] >= 0) {
    312                if (indexValue[j] == x) {
    313                    v.addElement(ruleVector->elementAt(j), status);
    314                }
    315            } else {
    316                // If the indexValue is < 0, then the first key character is
    317                // a set, and we must use the more time-consuming
    318                // matchesIndexValue check.  In practice this happens
    319                // rarely, so we seldom treat this code path.
    320                TransliterationRule* r = static_cast<TransliterationRule*>(ruleVector->elementAt(j));
    321                if (r->matchesIndexValue(static_cast<uint8_t>(x))) {
    322                    v.addElement(r, status);
    323                }
    324            }
    325        }
    326    }
    327    uprv_free(indexValue);
    328    index[256] = v.size();
    329    if (U_FAILURE(status)) {
    330        return;
    331    }
    332 
    333    /* Freeze things into an array.
    334     */
    335    uprv_free(rules); // Contains alias pointers
    336 
    337    /* You can't do malloc(0)! */
    338    if (v.size() == 0) {
    339        rules = nullptr;
    340        return;
    341    }
    342    rules = static_cast<TransliterationRule**>(uprv_malloc(v.size() * sizeof(TransliterationRule*)));
    343    /* test for nullptr */
    344    if (rules == nullptr) {
    345        status = U_MEMORY_ALLOCATION_ERROR;
    346        return;
    347    }
    348    for (j=0; j<v.size(); ++j) {
    349        rules[j] = static_cast<TransliterationRule*>(v.elementAt(j));
    350    }
    351 
    352    // TODO Add error reporting that indicates the rules that
    353    //      are being masked.
    354    //UnicodeString errors;
    355 
    356    /* Check for masking.  This is MUCH faster than our old check,
    357     * which was each rule against each following rule, since we
    358     * only have to check for masking within each bin now.  It's
    359     * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
    360     * count, and n2 is the per-bin rule count.  But n2<<n1, so
    361     * it's a big win.
    362     */
    363    for (x=0; x<256; ++x) {
    364        for (j=index[x]; j<index[x+1]-1; ++j) {
    365            TransliterationRule* r1 = rules[j];
    366            for (int32_t k=j+1; k<index[x+1]; ++k) {
    367                TransliterationRule* r2 = rules[k];
    368                if (r1->masks(*r2)) {
    369 //|                 if (errors == null) {
    370 //|                     errors = new StringBuffer();
    371 //|                 } else {
    372 //|                     errors.append("\n");
    373 //|                 }
    374 //|                 errors.append("Rule " + r1 + " masks " + r2);
    375                    status = U_RULE_MASK_ERROR;
    376                    maskingError(*r1, *r2, parseError);
    377                    return;
    378                }
    379            }
    380        }
    381    }
    382 
    383    //if (errors != null) {
    384    //    throw new IllegalArgumentException(errors.toString());
    385    //}
    386 }
    387 
    388 /**
    389 * Transliterate the given text with the given UTransPosition
    390 * indices.  Return true if the transliteration should continue
    391 * or false if it should halt (because of a U_PARTIAL_MATCH match).
    392 * Note that false is only ever returned if isIncremental is true.
    393 * @param text the text to be transliterated
    394 * @param pos the position indices, which will be updated
    395 * @param incremental if true, assume new text may be inserted
    396 * at index.limit, and return false if there is a partial match.
    397 * @return true unless a U_PARTIAL_MATCH has been obtained,
    398 * indicating that transliteration should stop until more text
    399 * arrives.
    400 */
    401 UBool TransliterationRuleSet::transliterate(Replaceable& text,
    402                                            UTransPosition& pos,
    403                                            UBool incremental) {
    404    int16_t indexByte = static_cast<int16_t>(text.char32At(pos.start) & 0xFF);
    405    for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) {
    406        UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);
    407        switch (m) {
    408        case U_MATCH:
    409            _debugOut("match", rules[i], text, pos);
    410            return true;
    411        case U_PARTIAL_MATCH:
    412            _debugOut("partial match", rules[i], text, pos);
    413            return false;
    414        default: /* Ram: added default to make GCC happy */
    415            break;
    416        }
    417    }
    418    // No match or partial match from any rule
    419    pos.start += U16_LENGTH(text.char32At(pos.start));
    420    _debugOut("no match", nullptr, text, pos);
    421    return true;
    422 }
    423 
    424 /**
    425 * Create rule strings that represents this rule set.
    426 */
    427 UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
    428                                               UBool escapeUnprintable) const {
    429    int32_t i;
    430    int32_t count = ruleVector->size();
    431    ruleSource.truncate(0);
    432    for (i=0; i<count; ++i) {
    433        if (i != 0) {
    434            ruleSource.append(static_cast<char16_t>(0x000A) /*\n*/);
    435        }
    436        TransliterationRule *r =
    437            static_cast<TransliterationRule*>(ruleVector->elementAt(i));
    438        r->toRule(ruleSource, escapeUnprintable);
    439    }
    440    return ruleSource;
    441 }
    442 
    443 /**
    444 * Return the set of all characters that may be modified
    445 * (getTarget=false) or emitted (getTarget=true) by this set.
    446 */
    447 UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result,
    448                               UBool getTarget) const
    449 {
    450    result.clear();
    451    int32_t count = ruleVector->size();
    452    for (int32_t i=0; i<count; ++i) {
    453        TransliterationRule* r =
    454            static_cast<TransliterationRule*>(ruleVector->elementAt(i));
    455        if (getTarget) {
    456            r->addTargetSetTo(result);
    457        } else {
    458            r->addSourceSetTo(result);
    459        }
    460    }
    461    return result;
    462 }
    463 
    464 U_NAMESPACE_END
    465 
    466 #endif /* #if !UCONFIG_NO_TRANSLITERATION */