tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

strmatch.cpp (8844B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2001-2012, International Business Machines Corporation
      6 *   and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   07/23/01    aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "strmatch.h"
     18 #include "rbt_data.h"
     19 #include "util.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/utf16.h"
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
     26 
     27 StringMatcher::StringMatcher(const UnicodeString& theString,
     28                             int32_t start,
     29                             int32_t limit,
     30                             int32_t segmentNum,
     31                             const TransliterationRuleData& theData) :
     32    data(&theData),
     33    segmentNumber(segmentNum),
     34    matchStart(-1),
     35    matchLimit(-1)
     36 {
     37    theString.extractBetween(start, limit, pattern);
     38 }
     39 
     40 StringMatcher::StringMatcher(const StringMatcher& o) :
     41    UnicodeFunctor(o),
     42    UnicodeMatcher(o),
     43    UnicodeReplacer(o),
     44    pattern(o.pattern),
     45    data(o.data),
     46    segmentNumber(o.segmentNumber),
     47    matchStart(o.matchStart),
     48    matchLimit(o.matchLimit)
     49 {
     50 }
     51 
     52 /**
     53 * Destructor
     54 */
     55 StringMatcher::~StringMatcher() {
     56 }
     57 
     58 /**
     59 * Implement UnicodeFunctor
     60 */
     61 StringMatcher* StringMatcher::clone() const {
     62    return new StringMatcher(*this);
     63 }
     64 
     65 /**
     66 * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
     67 * and return the pointer.
     68 */
     69 UnicodeMatcher* StringMatcher::toMatcher() const {
     70  StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
     71  UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
     72  
     73  return nonconst_base;
     74 }
     75 
     76 /**
     77 * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
     78 * and return the pointer.
     79 */
     80 UnicodeReplacer* StringMatcher::toReplacer() const {
     81  StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
     82  UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
     83  
     84  return nonconst_base;
     85 }
     86 
     87 /**
     88 * Implement UnicodeMatcher
     89 */
     90 UMatchDegree StringMatcher::matches(const Replaceable& text,
     91                                    int32_t& offset,
     92                                    int32_t limit,
     93                                    UBool incremental) {
     94    int32_t i;
     95    int32_t cursor = offset;
     96    if (limit < cursor) {
     97        // Match in the reverse direction
     98        for (i=pattern.length()-1; i>=0; --i) {
     99            char16_t keyChar = pattern.charAt(i);
    100            UnicodeMatcher* subm = data->lookupMatcher(keyChar);
    101            if (subm == nullptr) {
    102                if (cursor > limit &&
    103                    keyChar == text.charAt(cursor)) {
    104                    --cursor;
    105                } else {
    106                    return U_MISMATCH;
    107                }
    108            } else {
    109                UMatchDegree m =
    110                    subm->matches(text, cursor, limit, incremental);
    111                if (m != U_MATCH) {
    112                    return m;
    113                }
    114            }
    115        }
    116        // Record the match position, but adjust for a normal
    117        // forward start, limit, and only if a prior match does not
    118        // exist -- we want the rightmost match.
    119        if (matchStart < 0) {
    120            matchStart = cursor+1;
    121            matchLimit = offset+1;
    122        }
    123    } else {
    124        for (i=0; i<pattern.length(); ++i) {
    125            if (incremental && cursor == limit) {
    126                // We've reached the context limit without a mismatch and
    127                // without completing our match.
    128                return U_PARTIAL_MATCH;
    129            }
    130            char16_t keyChar = pattern.charAt(i);
    131            UnicodeMatcher* subm = data->lookupMatcher(keyChar);
    132            if (subm == nullptr) {
    133                // Don't need the cursor < limit check if
    134                // incremental is true (because it's done above); do need
    135                // it otherwise.
    136                if (cursor < limit &&
    137                    keyChar == text.charAt(cursor)) {
    138                    ++cursor;
    139                } else {
    140                    return U_MISMATCH;
    141                }
    142            } else {
    143                UMatchDegree m =
    144                    subm->matches(text, cursor, limit, incremental);
    145                if (m != U_MATCH) {
    146                    return m;
    147                }
    148            }
    149        }
    150        // Record the match position
    151        matchStart = offset;
    152        matchLimit = cursor;
    153    }
    154 
    155    offset = cursor;
    156    return U_MATCH;
    157 }
    158 
    159 /**
    160 * Implement UnicodeMatcher
    161 */
    162 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
    163                                        UBool escapeUnprintable) const
    164 {
    165    result.truncate(0);
    166    UnicodeString str, quoteBuf;
    167    if (segmentNumber > 0) {
    168        result.append(static_cast<char16_t>(40)); /*(*/
    169    }
    170    for (int32_t i=0; i<pattern.length(); ++i) {
    171        char16_t keyChar = pattern.charAt(i);
    172        const UnicodeMatcher* m = data->lookupMatcher(keyChar);
    173        if (m == nullptr) {
    174            ICU_Utility::appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
    175        } else {
    176            ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
    177                         true, escapeUnprintable, quoteBuf);
    178        }
    179    }
    180    if (segmentNumber > 0) {
    181        result.append(static_cast<char16_t>(41)); /*)*/
    182    }
    183    // Flush quoteBuf out to result
    184    ICU_Utility::appendToRule(result, -1,
    185                              true, escapeUnprintable, quoteBuf);
    186    return result;
    187 }
    188 
    189 /**
    190 * Implement UnicodeMatcher
    191 */
    192 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
    193    if (pattern.length() == 0) {
    194        return true;
    195    }
    196    UChar32 c = pattern.char32At(0);
    197    const UnicodeMatcher *m = data->lookupMatcher(c);
    198    return (m == nullptr) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
    199 }
    200 
    201 /**
    202 * Implement UnicodeMatcher
    203 */
    204 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
    205    UChar32 ch;
    206    for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
    207        ch = pattern.char32At(i);
    208        const UnicodeMatcher* matcher = data->lookupMatcher(ch);
    209        if (matcher == nullptr) {
    210            toUnionTo.add(ch);
    211        } else {
    212            matcher->addMatchSetTo(toUnionTo);
    213        }
    214    }
    215 }
    216 
    217 /**
    218 * UnicodeReplacer API
    219 */
    220 int32_t StringMatcher::replace(Replaceable& text,
    221                               int32_t start,
    222                               int32_t limit,
    223                               int32_t& /*cursor*/) {
    224    
    225    int32_t outLen = 0;
    226    
    227    // Copy segment with out-of-band data
    228    int32_t dest = limit;
    229    // If there was no match, that means that a quantifier
    230    // matched zero-length.  E.g., x (a)* y matched "xy".
    231    if (matchStart >= 0) {
    232        if (matchStart != matchLimit) {
    233            text.copy(matchStart, matchLimit, dest);
    234            outLen = matchLimit - matchStart;
    235        }
    236    }
    237    
    238    text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
    239    
    240    return outLen;
    241 }
    242 
    243 /**
    244 * UnicodeReplacer API
    245 */
    246 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
    247                                                UBool /*escapeUnprintable*/) const {
    248    // assert(segmentNumber > 0);
    249    rule.truncate(0);
    250    rule.append(static_cast<char16_t>(0x0024) /*$*/);
    251    ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
    252    return rule;
    253 }
    254 
    255 /**
    256 * Remove any match info.  This must be called before performing a
    257 * set of matches with this segment.
    258 */
    259 void StringMatcher::resetMatch() {
    260    matchStart = matchLimit = -1;
    261 }
    262 
    263 /**
    264 * Union the set of all characters that may output by this object
    265 * into the given set.
    266 * @param toUnionTo the set into which to union the output characters
    267 */
    268 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
    269    // The output of this replacer varies; it is the source text between
    270    // matchStart and matchLimit.  Since this varies depending on the
    271    // input text, we can't compute it here.  We can either do nothing
    272    // or we can add ALL characters to the set.  It's probably more useful
    273    // to do nothing.
    274 }
    275 
    276 /**
    277 * Implement UnicodeFunctor
    278 */
    279 void StringMatcher::setData(const TransliterationRuleData* d) {
    280    data = d;
    281    int32_t i = 0;
    282    while (i<pattern.length()) {
    283        UChar32 c = pattern.char32At(i);
    284        UnicodeFunctor* f = data->lookup(c);
    285        if (f != nullptr) {
    286            f->setData(data);
    287        }
    288        i += U16_LENGTH(c);
    289    }
    290 }
    291 
    292 U_NAMESPACE_END
    293 
    294 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    295 
    296 //eof