tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

rbt.cpp (10647B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 1999-2015, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   11/17/99    aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/rep.h"
     18 #include "unicode/uniset.h"
     19 #include "rbt_pars.h"
     20 #include "rbt_data.h"
     21 #include "rbt_rule.h"
     22 #include "rbt.h"
     23 #include "mutex.h"
     24 #include "umutex.h"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
     29 
     30 static Replaceable *gLockedText = nullptr;
     31 
     32 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
     33                                         UTransDirection direction,
     34                                         UParseError& parseError,
     35                                         UErrorCode& status) {
     36    fData = nullptr;
     37    isDataOwned = true;
     38    if (U_FAILURE(status)) {
     39        return;
     40    }
     41 
     42    TransliteratorParser parser(status);
     43    parser.parse(rules, direction, parseError, status);
     44    if (U_FAILURE(status)) {
     45        return;
     46    }
     47 
     48    if (parser.idBlockVector.size() != 0 ||
     49        parser.compoundFilter != nullptr ||
     50        parser.dataVector.size() == 0) {
     51        status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
     52        return;
     53    }
     54 
     55    fData = static_cast<TransliterationRuleData*>(parser.dataVector.orphanElementAt(0));
     56    setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
     57 }
     58 
     59 /**
     60 * Constructs a new transliterator from the given rules.
     61 * @param id            the id for the transliterator.
     62 * @param rules         rules, separated by ';'
     63 * @param direction     either FORWARD or REVERSE.
     64 * @param adoptedFilter the filter for this transliterator.
     65 * @param parseError    Struct to receive information on position 
     66 *                      of error if an error is encountered
     67 * @param status        Output param set to success/failure code.
     68 * @exception IllegalArgumentException if rules are malformed
     69 * or direction is invalid.
     70 */
     71 RuleBasedTransliterator::RuleBasedTransliterator(
     72                            const UnicodeString& id,
     73                            const UnicodeString& rules,
     74                            UTransDirection direction,
     75                            UnicodeFilter* adoptedFilter,
     76                            UParseError& parseError,
     77                            UErrorCode& status) :
     78    Transliterator(id, adoptedFilter) {
     79    _construct(rules, direction,parseError,status);
     80 }
     81 
     82 /**
     83 * Constructs a new transliterator from the given rules.
     84 * @param id            the id for the transliterator.
     85 * @param rules         rules, separated by ';'
     86 * @param direction     either FORWARD or REVERSE.
     87 * @param adoptedFilter the filter for this transliterator.
     88 * @param status        Output param set to success/failure code.
     89 * @exception IllegalArgumentException if rules are malformed
     90 * or direction is invalid.
     91 */
     92 /*RuleBasedTransliterator::RuleBasedTransliterator(
     93                            const UnicodeString& id,
     94                            const UnicodeString& rules,
     95                            UTransDirection direction,
     96                            UnicodeFilter* adoptedFilter,
     97                            UErrorCode& status) :
     98    Transliterator(id, adoptedFilter) {
     99    UParseError parseError;
    100    _construct(rules, direction,parseError, status);
    101 }*/
    102 
    103 /**
    104 * Convenience constructor with no filter.
    105 */
    106 /*RuleBasedTransliterator::RuleBasedTransliterator(
    107                            const UnicodeString& id,
    108                            const UnicodeString& rules,
    109                            UTransDirection direction,
    110                            UErrorCode& status) :
    111    Transliterator(id, 0) {
    112    UParseError parseError;
    113    _construct(rules, direction,parseError, status);
    114 }*/
    115 
    116 /**
    117 * Convenience constructor with no filter and FORWARD direction.
    118 */
    119 /*RuleBasedTransliterator::RuleBasedTransliterator(
    120                            const UnicodeString& id,
    121                            const UnicodeString& rules,
    122                            UErrorCode& status) :
    123    Transliterator(id, 0) {
    124    UParseError parseError;
    125    _construct(rules, UTRANS_FORWARD, parseError, status);
    126 }*/
    127 
    128 /**
    129 * Convenience constructor with FORWARD direction.
    130 */
    131 /*RuleBasedTransliterator::RuleBasedTransliterator(
    132                            const UnicodeString& id,
    133                            const UnicodeString& rules,
    134                            UnicodeFilter* adoptedFilter,
    135                            UErrorCode& status) :
    136    Transliterator(id, adoptedFilter) {
    137    UParseError parseError;
    138    _construct(rules, UTRANS_FORWARD,parseError, status);
    139 }*/
    140 
    141 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
    142                                 const TransliterationRuleData* theData,
    143                                 UnicodeFilter* adoptedFilter) :
    144    Transliterator(id, adoptedFilter),
    145    fData(const_cast<TransliterationRuleData*>(theData)), // cast away const
    146    isDataOwned(false) {
    147    setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
    148 }
    149 
    150 /**
    151 * Internal constructor.
    152 */
    153 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
    154                                                 TransliterationRuleData* theData,
    155                                                 UBool isDataAdopted) :
    156    Transliterator(id, nullptr),
    157    fData(theData),
    158    isDataOwned(isDataAdopted) {
    159    setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
    160 }
    161 
    162 /**
    163 * Copy constructor.
    164 */
    165 RuleBasedTransliterator::RuleBasedTransliterator(
    166        const RuleBasedTransliterator& other) :
    167    Transliterator(other), fData(other.fData),
    168    isDataOwned(other.isDataOwned) {
    169 
    170    // The data object may or may not be owned.  If it is not owned we
    171    // share it; it is invariant.  If it is owned, it's still
    172    // invariant, but we need to copy it to prevent double-deletion.
    173    // If this becomes a performance issue (if people do a lot of RBT
    174    // copying -- unlikely) we can reference count the data object.
    175 
    176    // Only do a deep copy if this is owned data, that is, data that
    177    // will be later deleted.  System transliterators contain
    178    // non-owned data.
    179    if (isDataOwned) {
    180        fData = new TransliterationRuleData(*other.fData);
    181    }
    182 }
    183 
    184 /**
    185 * Destructor.
    186 */
    187 RuleBasedTransliterator::~RuleBasedTransliterator() {
    188    // Delete the data object only if we own it.
    189    if (isDataOwned) {
    190        delete fData;
    191    }
    192 }
    193 
    194 RuleBasedTransliterator*
    195 RuleBasedTransliterator::clone() const {
    196    return new RuleBasedTransliterator(*this);
    197 }
    198 
    199 /**
    200 * Implements {@link Transliterator#handleTransliterate}.
    201 */
    202 void
    203 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
    204                                             UBool isIncremental) const {
    205    /* We keep contextStart and contextLimit fixed the entire time,
    206     * relative to the text -- contextLimit may move numerically if
    207     * text is inserted or removed.  The start offset moves toward
    208     * limit, with replacements happening under it.
    209     *
    210     * Example: rules 1. ab>x|y
    211     *                2. yc>z
    212     *
    213     * |eabcd   begin - no match, advance start
    214     * e|abcd   match rule 1 - change text & adjust start
    215     * ex|ycd   match rule 2 - change text & adjust start
    216     * exz|d    no match, advance start
    217     * exzd|    done
    218     */
    219 
    220    /* A rule like
    221     *   a>b|a
    222     * creates an infinite loop. To prevent that, we put an arbitrary
    223     * limit on the number of iterations that we take, one that is
    224     * high enough that any reasonable rules are ok, but low enough to
    225     * prevent a server from hanging.  The limit is 16 times the
    226     * number of characters n, unless n is so large that 16n exceeds a
    227     * uint32_t.
    228     */
    229    uint32_t loopCount = 0;
    230    uint32_t loopLimit = index.limit - index.start;
    231    if (loopLimit >= 0x10000000) {
    232        loopLimit = 0xFFFFFFFF;
    233    } else {
    234        loopLimit <<= 4;
    235    }
    236 
    237    // Transliterator locking.  Rule-based Transliterators are not thread safe; concurrent
    238    //   operations must be prevented.  
    239    // A Complication: compound transliterators can result in recursive entries to this
    240    //   function, sometimes with different "This" objects, always with the same text. 
    241    //   Double-locking must be prevented in these cases.
    242    //   
    243 
    244    UBool    lockedMutexAtThisLevel = false;
    245 
    246    // Test whether this request is operating on the same text string as
    247    //   some other transliteration that is still in progress and holding the 
    248    //   transliteration mutex.  If so, do not lock the transliteration
    249    //    mutex again.
    250    //
    251    //  gLockedText variable is protected by the global ICU mutex.
    252    //  Shared RBT data protected by transliteratorDataMutex.
    253    //
    254    // TODO(andy): Need a better scheme for handling this.
    255 
    256    static UMutex transliteratorDataMutex;
    257    UBool needToLock;
    258    {
    259        Mutex m;
    260        needToLock = (&text != gLockedText);
    261    }
    262    if (needToLock) {
    263        umtx_lock(&transliteratorDataMutex);  // Contention, longish waits possible here.
    264        Mutex m;
    265        gLockedText = &text;
    266        lockedMutexAtThisLevel = true;
    267    }
    268    
    269    // Check to make sure we don't dereference a null pointer.
    270    if (fData != nullptr) {
    271     while (index.start < index.limit &&
    272            loopCount <= loopLimit &&
    273            fData->ruleSet.transliterate(text, index, isIncremental)) {
    274         ++loopCount;
    275     }
    276    }
    277    if (lockedMutexAtThisLevel) {
    278        {
    279            Mutex m;
    280            gLockedText = nullptr;
    281        }
    282        umtx_unlock(&transliteratorDataMutex);
    283    }
    284 }
    285 
    286 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
    287                                                UBool escapeUnprintable) const {
    288    return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
    289 }
    290 
    291 /**
    292 * Implement Transliterator framework
    293 */
    294 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
    295    fData->ruleSet.getSourceTargetSet(result, false);
    296 }
    297 
    298 /**
    299 * Override Transliterator framework
    300 */
    301 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
    302    return fData->ruleSet.getSourceTargetSet(result, true);
    303 }
    304 
    305 U_NAMESPACE_END
    306 
    307 #endif /* #if !UCONFIG_NO_TRANSLITERATION */