[ tor-browser ].git.dasho

unisetspan.cpp (62307B)
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2007-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  unisetspan.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2007mar01
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/ustring.h"
     22 #include "unicode/utf8.h"
     23 #include "unicode/utf16.h"
     24 #include "cmemory.h"
     25 #include "uvector.h"
     26 #include "unisetspan.h"
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 /*
     31 * List of offsets from the current position from where to try matching
     32 * a code point or a string.
     33 * Store offsets rather than indexes to simplify the code and use the same list
     34 * for both increments (in span()) and decrements (in spanBack()).
     35 *
     36 * Assumption: The maximum offset is limited, and the offsets that are stored
     37 * at any one time are relatively dense, that is, there are normally no gaps of
     38 * hundreds or thousands of offset values.
     39 *
     40 * The implementation uses a circular buffer of byte flags,
     41 * each indicating whether the corresponding offset is in the list.
     42 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
     43 * physically moving part of the list.
     44 *
     45 * Note: In principle, the caller should setMaxLength() to the maximum of the
     46 * max string length and U16_LENGTH/U8_LENGTH to account for
     47 * "long" single code points.
     48 * However, this implementation uses at least a staticList with more than
     49 * U8_LENGTH entries anyway.
     50 *
     51 * Note: If maxLength were guaranteed to be no more than 32 or 64,
     52 * the list could be stored as bit flags in a single integer.
     53 * Rather than handling a circular buffer with a start list index,
     54 * the integer would simply be shifted when lower offsets are removed.
     55 * UnicodeSet does not have a limit on the lengths of strings.
     56 */
     57 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
     58 public:
     59    OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
     60 
     61    ~OffsetList() {
     62        if(list!=staticList) {
     63            uprv_free(list);
     64        }
     65    }
     66 
     67    // Call exactly once if the list is to be used.
     68    void setMaxLength(int32_t maxLength) {
     69        if (maxLength <= static_cast<int32_t>(sizeof(staticList))) {
     70            capacity = static_cast<int32_t>(sizeof(staticList));
     71        } else {
     72            UBool* l = static_cast<UBool*>(uprv_malloc(maxLength));
     73            if(l!=nullptr) {
     74                list=l;
     75                capacity=maxLength;
     76            }
     77        }
     78        uprv_memset(list, 0, capacity);
     79    }
     80 
     81    void clear() {
     82        uprv_memset(list, 0, capacity);
     83        start=length=0;
     84    }
     85 
     86    UBool isEmpty() const {
     87        return length == 0;
     88    }
     89 
     90    // Reduce all stored offsets by delta, used when the current position
     91    // moves by delta.
     92    // There must not be any offsets lower than delta.
     93    // If there is an offset equal to delta, it is removed.
     94    // delta=[1..maxLength]
     95    void shift(int32_t delta) {
     96        int32_t i=start+delta;
     97        if(i>=capacity) {
     98            i-=capacity;
     99        }
    100        if(list[i]) {
    101            list[i]=false;
    102            --length;
    103        }
    104        start=i;
    105    }
    106 
    107    // Add an offset. The list must not contain it yet.
    108    // offset=[1..maxLength]
    109    void addOffset(int32_t offset) {
    110        int32_t i=start+offset;
    111        if(i>=capacity) {
    112            i-=capacity;
    113        }
    114        list[i]=true;
    115        ++length;
    116    }
    117 
    118    // offset=[1..maxLength]
    119    UBool containsOffset(int32_t offset) const {
    120        int32_t i=start+offset;
    121        if(i>=capacity) {
    122            i-=capacity;
    123        }
    124        return list[i];
    125    }
    126 
    127    // Find the lowest stored offset from a non-empty list, remove it,
    128    // and reduce all other offsets by this minimum.
    129    // Returns [1..maxLength].
    130    int32_t popMinimum() {
    131        // Look for the next offset in list[start+1..capacity-1].
    132        int32_t i=start, result;
    133        while(++i<capacity) {
    134            if(list[i]) {
    135                list[i]=false;
    136                --length;
    137                result=i-start;
    138                start=i;
    139                return result;
    140            }
    141        }
    142        // i==capacity
    143 
    144        // Wrap around and look for the next offset in list[0..start].
    145        // Since the list is not empty, there will be one.
    146        result=capacity-start;
    147        i=0;
    148        while(!list[i]) {
    149            ++i;
    150        }
    151        list[i]=false;
    152        --length;
    153        start=i;
    154        return result+=i;
    155    }
    156 
    157 private:
    158    UBool *list;
    159    int32_t capacity;
    160    int32_t length;
    161    int32_t start;
    162 
    163    UBool staticList[16];
    164 };
    165 
    166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
    167 static int32_t
    168 getUTF8Length(const char16_t *s, int32_t length) {
    169    UErrorCode errorCode=U_ZERO_ERROR;
    170    int32_t length8=0;
    171    u_strToUTF8(nullptr, 0, &length8, s, length, &errorCode);
    172    if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
    173        return length8;
    174    } else {
    175        // The string contains an unpaired surrogate.
    176        // Ignore this string.
    177        return 0;
    178    }
    179 }
    180 
    181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
    182 static int32_t
    183 appendUTF8(const char16_t *s, int32_t length, uint8_t *t, int32_t capacity) {
    184    UErrorCode errorCode=U_ZERO_ERROR;
    185    int32_t length8=0;
    186    u_strToUTF8(reinterpret_cast<char*>(t), capacity, &length8, s, length, &errorCode);
    187    if(U_SUCCESS(errorCode)) {
    188        return length8;
    189    } else {
    190        // The string contains an unpaired surrogate.
    191        // Ignore this string.
    192        return 0;
    193    }
    194 }
    195 
    196 static inline uint8_t
    197 makeSpanLengthByte(int32_t spanLength) {
    198    // 0xfe==UnicodeSetStringSpan::LONG_SPAN
    199    return spanLength < 0xfe ? static_cast<uint8_t>(spanLength) : static_cast<uint8_t>(0xfe);
    200 }
    201 
    202 // Construct for all variants of span(), or only for any one variant.
    203 // Initialize as little as possible, for single use.
    204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
    205                                           const UVector &setStrings,
    206                                           uint32_t which)
    207        : spanSet(0, 0x10ffff), pSpanNotSet(nullptr), strings(setStrings),
    208          utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
    209          utf8Length(0),
    210          maxLength16(0), maxLength8(0),
    211          all(static_cast<UBool>(which == ALL)) {
    212    spanSet.retainAll(set);
    213    if(which&NOT_CONTAINED) {
    214        // Default to the same sets.
    215        // addToSpanNotSet() will create a separate set if necessary.
    216        pSpanNotSet=&spanSet;
    217    }
    218 
    219    // Determine if the strings even need to be taken into account at all for span() etc.
    220    // If any string is relevant, then all strings need to be used for
    221    // span(longest match) but only the relevant ones for span(while contained).
    222    // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
    223    //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
    224    //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
    225    // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
    226    int32_t stringsLength=strings.size();
    227 
    228    int32_t i, spanLength;
    229    UBool someRelevant=false;
    230    for(i=0; i<stringsLength; ++i) {
    231        const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
    232        const char16_t *s16=string.getBuffer();
    233        int32_t length16=string.length();
    234        if (length16==0) {
    235            continue;  // skip the empty string
    236        }
    237        UBool thisRelevant;
    238        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
    239        if(spanLength<length16) {  // Relevant string.
    240            someRelevant=thisRelevant=true;
    241        } else {
    242            thisRelevant=false;
    243        }
    244        if((which&UTF16) && length16>maxLength16) {
    245            maxLength16=length16;
    246        }
    247        if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
    248            int32_t length8=getUTF8Length(s16, length16);
    249            utf8Length+=length8;
    250            if(length8>maxLength8) {
    251                maxLength8=length8;
    252            }
    253        }
    254    }
    255    if(!someRelevant) {
    256        maxLength16=maxLength8=0;
    257        return;
    258    }
    259 
    260    // Freeze after checking for the need to use strings at all because freezing
    261    // a set takes some time and memory which are wasted if there are no relevant strings.
    262    if(all) {
    263        spanSet.freeze();
    264    }
    265 
    266    uint8_t *spanBackLengths;
    267    uint8_t *spanUTF8Lengths;
    268    uint8_t *spanBackUTF8Lengths;
    269 
    270    // Allocate a block of meta data.
    271    int32_t allocSize;
    272    if(all) {
    273        // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
    274        allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
    275    } else {
    276        allocSize=stringsLength;  // One set of span lengths.
    277        if(which&UTF8) {
    278            // UTF-8 lengths and UTF-8 strings.
    279            allocSize+=stringsLength*4+utf8Length;
    280        }
    281    }
    282    if (allocSize <= static_cast<int32_t>(sizeof(staticLengths))) {
    283        utf8Lengths=staticLengths;
    284    } else {
    285        utf8Lengths = static_cast<int32_t*>(uprv_malloc(allocSize));
    286        if(utf8Lengths==nullptr) {
    287            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return false.
    288            return;  // Out of memory.
    289        }
    290    }
    291 
    292    if(all) {
    293        // Store span lengths for all span() variants.
    294        spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths + stringsLength);
    295        spanBackLengths=spanLengths+stringsLength;
    296        spanUTF8Lengths=spanBackLengths+stringsLength;
    297        spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
    298        utf8=spanBackUTF8Lengths+stringsLength;
    299    } else {
    300        // Store span lengths for only one span() variant.
    301        if(which&UTF8) {
    302            spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths + stringsLength);
    303            utf8=spanLengths+stringsLength;
    304        } else {
    305            spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths);
    306        }
    307        spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
    308    }
    309 
    310    // Set the meta data and pSpanNotSet and write the UTF-8 strings.
    311    int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
    312 
    313    for(i=0; i<stringsLength; ++i) {
    314        const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
    315        const char16_t *s16=string.getBuffer();
    316        int32_t length16=string.length();
    317        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
    318        if(spanLength<length16 && length16>0) {  // Relevant string.
    319            if(which&UTF16) {
    320                if(which&CONTAINED) {
    321                    if(which&FWD) {
    322                        spanLengths[i]=makeSpanLengthByte(spanLength);
    323                    }
    324                    if(which&BACK) {
    325                        spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
    326                        spanBackLengths[i]=makeSpanLengthByte(spanLength);
    327                    }
    328                } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
    329                    spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
    330                }
    331            }
    332            if(which&UTF8) {
    333                uint8_t *s8=utf8+utf8Count;
    334                int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
    335                utf8Count+=utf8Lengths[i]=length8;
    336                if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
    337                    spanUTF8Lengths[i] = spanBackUTF8Lengths[i] = static_cast<uint8_t>(ALL_CP_CONTAINED);
    338                } else {  // Relevant for UTF-8.
    339                    if(which&CONTAINED) {
    340                        if(which&FWD) {
    341                            spanLength = spanSet.spanUTF8(reinterpret_cast<const char*>(s8), length8, USET_SPAN_CONTAINED);
    342                            spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
    343                        }
    344                        if(which&BACK) {
    345                            spanLength = length8 - spanSet.spanBackUTF8(reinterpret_cast<const char*>(s8), length8, USET_SPAN_CONTAINED);
    346                            spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
    347                        }
    348                    } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
    349                        spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
    350                    }
    351                }
    352            }
    353            if(which&NOT_CONTAINED) {
    354                // Add string start and end code points to the spanNotSet so that
    355                // a span(while not contained) stops before any string.
    356                UChar32 c;
    357                if(which&FWD) {
    358                    int32_t len=0;
    359                    U16_NEXT(s16, len, length16, c);
    360                    addToSpanNotSet(c);
    361                }
    362                if(which&BACK) {
    363                    int32_t len=length16;
    364                    U16_PREV(s16, 0, len, c);
    365                    addToSpanNotSet(c);
    366                }
    367            }
    368        } else {  // Irrelevant string. (Also the empty string.)
    369            if(which&UTF8) {
    370                if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
    371                    uint8_t *s8=utf8+utf8Count;
    372                    int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
    373                    utf8Count+=utf8Lengths[i]=length8;
    374                } else {
    375                    utf8Lengths[i]=0;
    376                }
    377            }
    378            if(all) {
    379                spanLengths[i]=spanBackLengths[i]=
    380                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
    381                        static_cast<uint8_t>(ALL_CP_CONTAINED);
    382            } else {
    383                // All spanXYZLengths pointers contain the same address.
    384                spanLengths[i] = static_cast<uint8_t>(ALL_CP_CONTAINED);
    385            }
    386        }
    387    }
    388 
    389    // Finish.
    390    if(all) {
    391        pSpanNotSet->freeze();
    392    }
    393 }
    394 
    395 // Copy constructor. Assumes which==ALL for a frozen set.
    396 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
    397                                           const UVector &newParentSetStrings)
    398        : spanSet(otherStringSpan.spanSet), pSpanNotSet(nullptr), strings(newParentSetStrings),
    399          utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr),
    400          utf8Length(otherStringSpan.utf8Length),
    401          maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
    402          all(true) {
    403    if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
    404        pSpanNotSet=&spanSet;
    405    } else {
    406        pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
    407    }
    408 
    409    // Allocate a block of meta data.
    410    // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
    411    int32_t stringsLength=strings.size();
    412    int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
    413    if (allocSize <= static_cast<int32_t>(sizeof(staticLengths))) {
    414        utf8Lengths=staticLengths;
    415    } else {
    416        utf8Lengths = static_cast<int32_t*>(uprv_malloc(allocSize));
    417        if(utf8Lengths==nullptr) {
    418            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return false.
    419            return;  // Out of memory.
    420        }
    421    }
    422 
    423    spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths + stringsLength);
    424    utf8=spanLengths+stringsLength*4;
    425    uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
    426 }
    427 
    428 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
    429    if(pSpanNotSet!=nullptr && pSpanNotSet!=&spanSet) {
    430        delete pSpanNotSet;
    431    }
    432    if(utf8Lengths!=nullptr && utf8Lengths!=staticLengths) {
    433        uprv_free(utf8Lengths);
    434    }
    435 }
    436 
    437 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
    438    if(pSpanNotSet==nullptr || pSpanNotSet==&spanSet) {
    439        if(spanSet.contains(c)) {
    440            return;  // Nothing to do.
    441        }
    442        UnicodeSet *newSet=spanSet.cloneAsThawed();
    443        if(newSet==nullptr) {
    444            return;  // Out of memory.
    445        } else {
    446            pSpanNotSet=newSet;
    447        }
    448    }
    449    pSpanNotSet->add(c);
    450 }
    451 
    452 // Compare strings without any argument checks. Requires length>0.
    453 static inline UBool
    454 matches16(const char16_t *s, const char16_t *t, int32_t length) {
    455    do {
    456        if(*s++!=*t++) {
    457            return false;
    458        }
    459    } while(--length>0);
    460    return true;
    461 }
    462 
    463 static inline UBool
    464 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
    465    do {
    466        if(*s++!=*t++) {
    467            return false;
    468        }
    469    } while(--length>0);
    470    return true;
    471 }
    472 
    473 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
    474 // at code point boundaries.
    475 // That is, each edge of a match must not be in the middle of a surrogate pair.
    476 static inline UBool
    477 matches16CPB(const char16_t *s, int32_t start, int32_t limit, const char16_t *t, int32_t length) {
    478    s+=start;
    479    limit-=start;
    480    return matches16(s, t, length) &&
    481           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
    482           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
    483 }
    484 
    485 // Does the set contain the next code point?
    486 // If so, return its length; otherwise return its negative length.
    487 static inline int32_t
    488 spanOne(const UnicodeSet &set, const char16_t *s, int32_t length) {
    489    char16_t c=*s, c2;
    490    if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
    491        return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
    492    }
    493    return set.contains(c) ? 1 : -1;
    494 }
    495 
    496 static inline int32_t
    497 spanOneBack(const UnicodeSet &set, const char16_t *s, int32_t length) {
    498    char16_t c=s[length-1], c2;
    499    if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
    500        return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
    501    }
    502    return set.contains(c) ? 1 : -1;
    503 }
    504 
    505 static inline int32_t
    506 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    507    UChar32 c=*s;
    508    if(U8_IS_SINGLE(c)) {
    509        return set.contains(c) ? 1 : -1;
    510    }
    511    // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
    512    int32_t i=0;
    513    U8_NEXT_OR_FFFD(s, i, length, c);
    514    return set.contains(c) ? i : -i;
    515 }
    516 
    517 static inline int32_t
    518 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
    519    UChar32 c=s[length-1];
    520    if(U8_IS_SINGLE(c)) {
    521        return set.contains(c) ? 1 : -1;
    522    }
    523    int32_t i=length-1;
    524    c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
    525    length-=i;
    526    return set.contains(c) ? length : -length;
    527 }
    528 
    529 /*
    530 * Note: In span() when spanLength==0 (after a string match, or at the beginning
    531 * after an empty code point span) and in spanNot() and spanNotUTF8(),
    532 * string matching could use a binary search
    533 * because all string matches are done from the same start index.
    534 *
    535 * For UTF-8, this would require a comparison function that returns UTF-16 order.
    536 *
    537 * This optimization should not be necessary for normal UnicodeSets because
    538 * most sets have no strings, and most sets with strings have
    539 * very few very short strings.
    540 * For cases with many strings, it might be better to use a different API
    541 * and implementation with a DFA (state machine).
    542 */
    543 
    544 /*
    545 * Algorithm for span(USET_SPAN_CONTAINED)
    546 *
    547 * Theoretical algorithm:
    548 * - Iterate through the string, and at each code point boundary:
    549 *   + If the code point there is in the set, then remember to continue after it.
    550 *   + If a set string matches at the current position, then remember to continue after it.
    551 *   + Either recursively span for each code point or string match,
    552 *     or recursively span for all but the shortest one and
    553 *     iteratively continue the span with the shortest local match.
    554 *   + Remember the longest recursive span (the farthest end point).
    555 *   + If there is no match at the current position, neither for the code point there
    556 *     nor for any set string, then stop and return the longest recursive span length.
    557 *
    558 * Optimized implementation:
    559 *
    560 * (We assume that most sets will have very few very short strings.
    561 * A span using a string-less set is extremely fast.)
    562 *
    563 * Create and cache a spanSet which contains all of the single code points
    564 * of the original set but none of its strings.
    565 *
    566 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
    567 * - Loop:
    568 *   + Try to match each set string at the end of the spanLength.
    569 *     ~ Set strings that start with set-contained code points must be matched
    570 *       with a partial overlap because the recursive algorithm would have tried
    571 *       to match them at every position.
    572 *     ~ Set strings that entirely consist of set-contained code points
    573 *       are irrelevant for span(USET_SPAN_CONTAINED) because the
    574 *       recursive algorithm would continue after them anyway
    575 *       and find the longest recursive match from their end.
    576 *     ~ Rather than recursing, note each end point of a set string match.
    577 *   + If no set string matched after spanSet.span(), then return
    578 *     with where the spanSet.span() ended.
    579 *   + If at least one set string matched after spanSet.span(), then
    580 *     pop the shortest string match end point and continue
    581 *     the loop, trying to match all set strings from there.
    582 *   + If at least one more set string matched after a previous string match,
    583 *     then test if the code point after the previous string match is also
    584 *     contained in the set.
    585 *     Continue the loop with the shortest end point of either this code point
    586 *     or a matching set string.
    587 *   + If no more set string matched after a previous string match,
    588 *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
    589 *     Stop if spanLength==0, otherwise continue the loop.
    590 *
    591 * By noting each end point of a set string match,
    592 * the function visits each string position at most once and finishes
    593 * in linear time.
    594 *
    595 * The recursive algorithm may visit the same string position many times
    596 * if multiple paths lead to it and finishes in exponential time.
    597 */
    598 
    599 /*
    600 * Algorithm for span(USET_SPAN_SIMPLE)
    601 *
    602 * Theoretical algorithm:
    603 * - Iterate through the string, and at each code point boundary:
    604 *   + If the code point there is in the set, then remember to continue after it.
    605 *   + If a set string matches at the current position, then remember to continue after it.
    606 *   + Continue from the farthest match position and ignore all others.
    607 *   + If there is no match at the current position,
    608 *     then stop and return the current position.
    609 *
    610 * Optimized implementation:
    611 *
    612 * (Same assumption and spanSet as above.)
    613 *
    614 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
    615 * - Loop:
    616 *   + Try to match each set string at the end of the spanLength.
    617 *     ~ Set strings that start with set-contained code points must be matched
    618 *       with a partial overlap because the standard algorithm would have tried
    619 *       to match them earlier.
    620 *     ~ Set strings that entirely consist of set-contained code points
    621 *       must be matched with a full overlap because the longest-match algorithm
    622 *       would hide set string matches that end earlier.
    623 *       Such set strings need not be matched earlier inside the code point span
    624 *       because the standard algorithm would then have continued after
    625 *       the set string match anyway.
    626 *     ~ Remember the longest set string match (farthest end point) from the earliest
    627 *       starting point.
    628 *   + If no set string matched after spanSet.span(), then return
    629 *     with where the spanSet.span() ended.
    630 *   + If at least one set string matched, then continue the loop after the
    631 *     longest match from the earliest position.
    632 *   + If no more set string matched after a previous string match,
    633 *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
    634 *     Stop if spanLength==0, otherwise continue the loop.
    635 */
    636 
    637 int32_t UnicodeSetStringSpan::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
    638    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    639        return spanNot(s, length);
    640    }
    641    int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
    642    if(spanLength==length) {
    643        return length;
    644    }
    645 
    646    // Consider strings; they may overlap with the span.
    647    OffsetList offsets;
    648    if(spanCondition==USET_SPAN_CONTAINED) {
    649        // Use offset list to try all possibilities.
    650        offsets.setMaxLength(maxLength16);
    651    }
    652    int32_t pos=spanLength, rest=length-pos;
    653    int32_t i, stringsLength=strings.size();
    654    for(;;) {
    655        if(spanCondition==USET_SPAN_CONTAINED) {
    656            for(i=0; i<stringsLength; ++i) {
    657                int32_t overlap=spanLengths[i];
    658                if(overlap==ALL_CP_CONTAINED) {
    659                    continue;  // Irrelevant string. (Also the empty string.)
    660                }
    661                const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
    662                const char16_t *s16=string.getBuffer();
    663                int32_t length16=string.length();
    664                U_ASSERT(length>0);
    665 
    666                // Try to match this string at pos-overlap..pos.
    667                if(overlap>=LONG_SPAN) {
    668                    overlap=length16;
    669                    // While contained: No point matching fully inside the code point span.
    670                    U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
    671                }
    672                if(overlap>spanLength) {
    673                    overlap=spanLength;
    674                }
    675                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
    676                for(;;) {
    677                    if(inc>rest) {
    678                        break;
    679                    }
    680                    // Try to match if the increment is not listed already.
    681                    if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
    682                        if(inc==rest) {
    683                            return length;  // Reached the end of the string.
    684                        }
    685                        offsets.addOffset(inc);
    686                    }
    687                    if(overlap==0) {
    688                        break;
    689                    }
    690                    --overlap;
    691                    ++inc;
    692                }
    693            }
    694        } else /* USET_SPAN_SIMPLE */ {
    695            int32_t maxInc=0, maxOverlap=0;
    696            for(i=0; i<stringsLength; ++i) {
    697                int32_t overlap=spanLengths[i];
    698                // For longest match, we do need to try to match even an all-contained string
    699                // to find the match from the earliest start.
    700 
    701                const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
    702                const char16_t *s16=string.getBuffer();
    703                int32_t length16=string.length();
    704                if (length16==0) {
    705                    continue;  // skip the empty string
    706                }
    707 
    708                // Try to match this string at pos-overlap..pos.
    709                if(overlap>=LONG_SPAN) {
    710                    overlap=length16;
    711                    // Longest match: Need to match fully inside the code point span
    712                    // to find the match from the earliest start.
    713                }
    714                if(overlap>spanLength) {
    715                    overlap=spanLength;
    716                }
    717                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
    718                for(;;) {
    719                    if(inc>rest || overlap<maxOverlap) {
    720                        break;
    721                    }
    722                    // Try to match if the string is longer or starts earlier.
    723                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
    724                        matches16CPB(s, pos-overlap, length, s16, length16)
    725                    ) {
    726                        maxInc=inc;  // Longest match from earliest start.
    727                        maxOverlap=overlap;
    728                        break;
    729                    }
    730                    --overlap;
    731                    ++inc;
    732                }
    733            }
    734 
    735            if(maxInc!=0 || maxOverlap!=0) {
    736                // Longest-match algorithm, and there was a string match.
    737                // Simply continue after it.
    738                pos+=maxInc;
    739                rest-=maxInc;
    740                if(rest==0) {
    741                    return length;  // Reached the end of the string.
    742                }
    743                spanLength=0;  // Match strings from after a string match.
    744                continue;
    745            }
    746        }
    747        // Finished trying to match all strings at pos.
    748 
    749        if(spanLength!=0 || pos==0) {
    750            // The position is after an unlimited code point span (spanLength!=0),
    751            // not after a string match.
    752            // The only position where spanLength==0 after a span is pos==0.
    753            // Otherwise, an unlimited code point span is only tried again when no
    754            // strings match, and if such a non-initial span fails we stop.
    755            if(offsets.isEmpty()) {
    756                return pos;  // No strings matched after a span.
    757            }
    758            // Match strings from after the next string match.
    759        } else {
    760            // The position is after a string match (or a single code point).
    761            if(offsets.isEmpty()) {
    762                // No more strings matched after a previous string match.
    763                // Try another code point span from after the last string match.
    764                spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
    765                if( spanLength==rest || // Reached the end of the string, or
    766                    spanLength==0       // neither strings nor span progressed.
    767                ) {
    768                    return pos+spanLength;
    769                }
    770                pos+=spanLength;
    771                rest-=spanLength;
    772                continue;  // spanLength>0: Match strings from after a span.
    773            } else {
    774                // Try to match only one code point from after a string match if some
    775                // string matched beyond it, so that we try all possible positions
    776                // and don't overshoot.
    777                spanLength=spanOne(spanSet, s+pos, rest);
    778                if(spanLength>0) {
    779                    if(spanLength==rest) {
    780                        return length;  // Reached the end of the string.
    781                    }
    782                    // Match strings after this code point.
    783                    // There cannot be any increments below it because UnicodeSet strings
    784                    // contain multiple code points.
    785                    pos+=spanLength;
    786                    rest-=spanLength;
    787                    offsets.shift(spanLength);
    788                    spanLength=0;
    789                    continue;  // Match strings from after a single code point.
    790                }
    791                // Match strings from after the next string match.
    792            }
    793        }
    794        int32_t minOffset=offsets.popMinimum();
    795        pos+=minOffset;
    796        rest-=minOffset;
    797        spanLength=0;  // Match strings from after a string match.
    798    }
    799 }
    800 
    801 int32_t UnicodeSetStringSpan::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
    802    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    803        return spanNotBack(s, length);
    804    }
    805    int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
    806    if(pos==0) {
    807        return 0;
    808    }
    809    int32_t spanLength=length-pos;
    810 
    811    // Consider strings; they may overlap with the span.
    812    OffsetList offsets;
    813    if(spanCondition==USET_SPAN_CONTAINED) {
    814        // Use offset list to try all possibilities.
    815        offsets.setMaxLength(maxLength16);
    816    }
    817    int32_t i, stringsLength=strings.size();
    818    uint8_t *spanBackLengths=spanLengths;
    819    if(all) {
    820        spanBackLengths+=stringsLength;
    821    }
    822    for(;;) {
    823        if(spanCondition==USET_SPAN_CONTAINED) {
    824            for(i=0; i<stringsLength; ++i) {
    825                int32_t overlap=spanBackLengths[i];
    826                if(overlap==ALL_CP_CONTAINED) {
    827                    continue;  // Irrelevant string. (Also the empty string.)
    828                }
    829                const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
    830                const char16_t *s16=string.getBuffer();
    831                int32_t length16=string.length();
    832                U_ASSERT(length>0);
    833 
    834                // Try to match this string at pos-(length16-overlap)..pos-length16.
    835                if(overlap>=LONG_SPAN) {
    836                    overlap=length16;
    837                    // While contained: No point matching fully inside the code point span.
    838                    int32_t len1=0;
    839                    U16_FWD_1(s16, len1, overlap);
    840                    overlap-=len1;  // Length of the string minus the first code point.
    841                }
    842                if(overlap>spanLength) {
    843                    overlap=spanLength;
    844                }
    845                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
    846                for(;;) {
    847                    if(dec>pos) {
    848                        break;
    849                    }
    850                    // Try to match if the decrement is not listed already.
    851                    if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
    852                        if(dec==pos) {
    853                            return 0;  // Reached the start of the string.
    854                        }
    855                        offsets.addOffset(dec);
    856                    }
    857                    if(overlap==0) {
    858                        break;
    859                    }
    860                    --overlap;
    861                    ++dec;
    862                }
    863            }
    864        } else /* USET_SPAN_SIMPLE */ {
    865            int32_t maxDec=0, maxOverlap=0;
    866            for(i=0; i<stringsLength; ++i) {
    867                int32_t overlap=spanBackLengths[i];
    868                // For longest match, we do need to try to match even an all-contained string
    869                // to find the match from the latest end.
    870 
    871                const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
    872                const char16_t *s16=string.getBuffer();
    873                int32_t length16=string.length();
    874                if (length16==0) {
    875                    continue;  // skip the empty string
    876                }
    877 
    878                // Try to match this string at pos-(length16-overlap)..pos-length16.
    879                if(overlap>=LONG_SPAN) {
    880                    overlap=length16;
    881                    // Longest match: Need to match fully inside the code point span
    882                    // to find the match from the latest end.
    883                }
    884                if(overlap>spanLength) {
    885                    overlap=spanLength;
    886                }
    887                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
    888                for(;;) {
    889                    if(dec>pos || overlap<maxOverlap) {
    890                        break;
    891                    }
    892                    // Try to match if the string is longer or ends later.
    893                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
    894                        matches16CPB(s, pos-dec, length, s16, length16)
    895                    ) {
    896                        maxDec=dec;  // Longest match from latest end.
    897                        maxOverlap=overlap;
    898                        break;
    899                    }
    900                    --overlap;
    901                    ++dec;
    902                }
    903            }
    904 
    905            if(maxDec!=0 || maxOverlap!=0) {
    906                // Longest-match algorithm, and there was a string match.
    907                // Simply continue before it.
    908                pos-=maxDec;
    909                if(pos==0) {
    910                    return 0;  // Reached the start of the string.
    911                }
    912                spanLength=0;  // Match strings from before a string match.
    913                continue;
    914            }
    915        }
    916        // Finished trying to match all strings at pos.
    917 
    918        if(spanLength!=0 || pos==length) {
    919            // The position is before an unlimited code point span (spanLength!=0),
    920            // not before a string match.
    921            // The only position where spanLength==0 before a span is pos==length.
    922            // Otherwise, an unlimited code point span is only tried again when no
    923            // strings match, and if such a non-initial span fails we stop.
    924            if(offsets.isEmpty()) {
    925                return pos;  // No strings matched before a span.
    926            }
    927            // Match strings from before the next string match.
    928        } else {
    929            // The position is before a string match (or a single code point).
    930            if(offsets.isEmpty()) {
    931                // No more strings matched before a previous string match.
    932                // Try another code point span from before the last string match.
    933                int32_t oldPos=pos;
    934                pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
    935                spanLength=oldPos-pos;
    936                if( pos==0 ||           // Reached the start of the string, or
    937                    spanLength==0       // neither strings nor span progressed.
    938                ) {
    939                    return pos;
    940                }
    941                continue;  // spanLength>0: Match strings from before a span.
    942            } else {
    943                // Try to match only one code point from before a string match if some
    944                // string matched beyond it, so that we try all possible positions
    945                // and don't overshoot.
    946                spanLength=spanOneBack(spanSet, s, pos);
    947                if(spanLength>0) {
    948                    if(spanLength==pos) {
    949                        return 0;  // Reached the start of the string.
    950                    }
    951                    // Match strings before this code point.
    952                    // There cannot be any decrements below it because UnicodeSet strings
    953                    // contain multiple code points.
    954                    pos-=spanLength;
    955                    offsets.shift(spanLength);
    956                    spanLength=0;
    957                    continue;  // Match strings from before a single code point.
    958                }
    959                // Match strings from before the next string match.
    960            }
    961        }
    962        pos-=offsets.popMinimum();
    963        spanLength=0;  // Match strings from before a string match.
    964    }
    965 }
    966 
    967 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    968    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    969        return spanNotUTF8(s, length);
    970    }
    971    int32_t spanLength = spanSet.spanUTF8(reinterpret_cast<const char*>(s), length, USET_SPAN_CONTAINED);
    972    if(spanLength==length) {
    973        return length;
    974    }
    975 
    976    // Consider strings; they may overlap with the span.
    977    OffsetList offsets;
    978    if(spanCondition==USET_SPAN_CONTAINED) {
    979        // Use offset list to try all possibilities.
    980        offsets.setMaxLength(maxLength8);
    981    }
    982    int32_t pos=spanLength, rest=length-pos;
    983    int32_t i, stringsLength=strings.size();
    984    uint8_t *spanUTF8Lengths=spanLengths;
    985    if(all) {
    986        spanUTF8Lengths+=2*stringsLength;
    987    }
    988    for(;;) {
    989        const uint8_t *s8=utf8;
    990        int32_t length8;
    991        if(spanCondition==USET_SPAN_CONTAINED) {
    992            for(i=0; i<stringsLength; ++i) {
    993                length8=utf8Lengths[i];
    994                if(length8==0) {
    995                    continue;  // String not representable in UTF-8.
    996                }
    997                int32_t overlap=spanUTF8Lengths[i];
    998                if(overlap==ALL_CP_CONTAINED) {
    999                    s8+=length8;
   1000                    continue;  // Irrelevant string.
   1001                }
   1002 
   1003                // Try to match this string at pos-overlap..pos.
   1004                if(overlap>=LONG_SPAN) {
   1005                    overlap=length8;
   1006                    // While contained: No point matching fully inside the code point span.
   1007                    U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
   1008                }
   1009                if(overlap>spanLength) {
   1010                    overlap=spanLength;
   1011                }
   1012                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
   1013                for(;;) {
   1014                    if(inc>rest) {
   1015                        break;
   1016                    }
   1017                    // Try to match if the increment is not listed already.
   1018                    // Match at code point boundaries. (The UTF-8 strings were converted
   1019                    // from UTF-16 and are guaranteed to be well-formed.)
   1020                    if(!U8_IS_TRAIL(s[pos-overlap]) &&
   1021                            !offsets.containsOffset(inc) &&
   1022                            matches8(s+pos-overlap, s8, length8)) {
   1023                        if(inc==rest) {
   1024                            return length;  // Reached the end of the string.
   1025                        }
   1026                        offsets.addOffset(inc);
   1027                    }
   1028                    if(overlap==0) {
   1029                        break;
   1030                    }
   1031                    --overlap;
   1032                    ++inc;
   1033                }
   1034                s8+=length8;
   1035            }
   1036        } else /* USET_SPAN_SIMPLE */ {
   1037            int32_t maxInc=0, maxOverlap=0;
   1038            for(i=0; i<stringsLength; ++i) {
   1039                length8=utf8Lengths[i];
   1040                if(length8==0) {
   1041                    continue;  // String not representable in UTF-8.
   1042                }
   1043                int32_t overlap=spanUTF8Lengths[i];
   1044                // For longest match, we do need to try to match even an all-contained string
   1045                // to find the match from the earliest start.
   1046 
   1047                // Try to match this string at pos-overlap..pos.
   1048                if(overlap>=LONG_SPAN) {
   1049                    overlap=length8;
   1050                    // Longest match: Need to match fully inside the code point span
   1051                    // to find the match from the earliest start.
   1052                }
   1053                if(overlap>spanLength) {
   1054                    overlap=spanLength;
   1055                }
   1056                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
   1057                for(;;) {
   1058                    if(inc>rest || overlap<maxOverlap) {
   1059                        break;
   1060                    }
   1061                    // Try to match if the string is longer or starts earlier.
   1062                    // Match at code point boundaries. (The UTF-8 strings were converted
   1063                    // from UTF-16 and are guaranteed to be well-formed.)
   1064                    if(!U8_IS_TRAIL(s[pos-overlap]) &&
   1065                            (overlap>maxOverlap ||
   1066                                /* redundant overlap==maxOverlap && */ inc>maxInc) &&
   1067                            matches8(s+pos-overlap, s8, length8)) {
   1068                        maxInc=inc;  // Longest match from earliest start.
   1069                        maxOverlap=overlap;
   1070                        break;
   1071                    }
   1072                    --overlap;
   1073                    ++inc;
   1074                }
   1075                s8+=length8;
   1076            }
   1077 
   1078            if(maxInc!=0 || maxOverlap!=0) {
   1079                // Longest-match algorithm, and there was a string match.
   1080                // Simply continue after it.
   1081                pos+=maxInc;
   1082                rest-=maxInc;
   1083                if(rest==0) {
   1084                    return length;  // Reached the end of the string.
   1085                }
   1086                spanLength=0;  // Match strings from after a string match.
   1087                continue;
   1088            }
   1089        }
   1090        // Finished trying to match all strings at pos.
   1091 
   1092        if(spanLength!=0 || pos==0) {
   1093            // The position is after an unlimited code point span (spanLength!=0),
   1094            // not after a string match.
   1095            // The only position where spanLength==0 after a span is pos==0.
   1096            // Otherwise, an unlimited code point span is only tried again when no
   1097            // strings match, and if such a non-initial span fails we stop.
   1098            if(offsets.isEmpty()) {
   1099                return pos;  // No strings matched after a span.
   1100            }
   1101            // Match strings from after the next string match.
   1102        } else {
   1103            // The position is after a string match (or a single code point).
   1104            if(offsets.isEmpty()) {
   1105                // No more strings matched after a previous string match.
   1106                // Try another code point span from after the last string match.
   1107                spanLength = spanSet.spanUTF8(reinterpret_cast<const char*>(s) + pos, rest, USET_SPAN_CONTAINED);
   1108                if( spanLength==rest || // Reached the end of the string, or
   1109                    spanLength==0       // neither strings nor span progressed.
   1110                ) {
   1111                    return pos+spanLength;
   1112                }
   1113                pos+=spanLength;
   1114                rest-=spanLength;
   1115                continue;  // spanLength>0: Match strings from after a span.
   1116            } else {
   1117                // Try to match only one code point from after a string match if some
   1118                // string matched beyond it, so that we try all possible positions
   1119                // and don't overshoot.
   1120                spanLength=spanOneUTF8(spanSet, s+pos, rest);
   1121                if(spanLength>0) {
   1122                    if(spanLength==rest) {
   1123                        return length;  // Reached the end of the string.
   1124                    }
   1125                    // Match strings after this code point.
   1126                    // There cannot be any increments below it because UnicodeSet strings
   1127                    // contain multiple code points.
   1128                    pos+=spanLength;
   1129                    rest-=spanLength;
   1130                    offsets.shift(spanLength);
   1131                    spanLength=0;
   1132                    continue;  // Match strings from after a single code point.
   1133                }
   1134                // Match strings from after the next string match.
   1135            }
   1136        }
   1137        int32_t minOffset=offsets.popMinimum();
   1138        pos+=minOffset;
   1139        rest-=minOffset;
   1140        spanLength=0;  // Match strings from after a string match.
   1141    }
   1142 }
   1143 
   1144 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
   1145    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   1146        return spanNotBackUTF8(s, length);
   1147    }
   1148    int32_t pos = spanSet.spanBackUTF8(reinterpret_cast<const char*>(s), length, USET_SPAN_CONTAINED);
   1149    if(pos==0) {
   1150        return 0;
   1151    }
   1152    int32_t spanLength=length-pos;
   1153 
   1154    // Consider strings; they may overlap with the span.
   1155    OffsetList offsets;
   1156    if(spanCondition==USET_SPAN_CONTAINED) {
   1157        // Use offset list to try all possibilities.
   1158        offsets.setMaxLength(maxLength8);
   1159    }
   1160    int32_t i, stringsLength=strings.size();
   1161    uint8_t *spanBackUTF8Lengths=spanLengths;
   1162    if(all) {
   1163        spanBackUTF8Lengths+=3*stringsLength;
   1164    }
   1165    for(;;) {
   1166        const uint8_t *s8=utf8;
   1167        int32_t length8;
   1168        if(spanCondition==USET_SPAN_CONTAINED) {
   1169            for(i=0; i<stringsLength; ++i) {
   1170                length8=utf8Lengths[i];
   1171                if(length8==0) {
   1172                    continue;  // String not representable in UTF-8.
   1173                }
   1174                int32_t overlap=spanBackUTF8Lengths[i];
   1175                if(overlap==ALL_CP_CONTAINED) {
   1176                    s8+=length8;
   1177                    continue;  // Irrelevant string.
   1178                }
   1179 
   1180                // Try to match this string at pos-(length8-overlap)..pos-length8.
   1181                if(overlap>=LONG_SPAN) {
   1182                    overlap=length8;
   1183                    // While contained: No point matching fully inside the code point span.
   1184                    int32_t len1=0;
   1185                    U8_FWD_1(s8, len1, overlap);
   1186                    overlap-=len1;  // Length of the string minus the first code point.
   1187                }
   1188                if(overlap>spanLength) {
   1189                    overlap=spanLength;
   1190                }
   1191                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
   1192                for(;;) {
   1193                    if(dec>pos) {
   1194                        break;
   1195                    }
   1196                    // Try to match if the decrement is not listed already.
   1197                    // Match at code point boundaries. (The UTF-8 strings were converted
   1198                    // from UTF-16 and are guaranteed to be well-formed.)
   1199                    if( !U8_IS_TRAIL(s[pos-dec]) &&
   1200                        !offsets.containsOffset(dec) &&
   1201                        matches8(s+pos-dec, s8, length8)
   1202                    ) {
   1203                        if(dec==pos) {
   1204                            return 0;  // Reached the start of the string.
   1205                        }
   1206                        offsets.addOffset(dec);
   1207                    }
   1208                    if(overlap==0) {
   1209                        break;
   1210                    }
   1211                    --overlap;
   1212                    ++dec;
   1213                }
   1214                s8+=length8;
   1215            }
   1216        } else /* USET_SPAN_SIMPLE */ {
   1217            int32_t maxDec=0, maxOverlap=0;
   1218            for(i=0; i<stringsLength; ++i) {
   1219                length8=utf8Lengths[i];
   1220                if(length8==0) {
   1221                    continue;  // String not representable in UTF-8.
   1222                }
   1223                int32_t overlap=spanBackUTF8Lengths[i];
   1224                // For longest match, we do need to try to match even an all-contained string
   1225                // to find the match from the latest end.
   1226 
   1227                // Try to match this string at pos-(length8-overlap)..pos-length8.
   1228                if(overlap>=LONG_SPAN) {
   1229                    overlap=length8;
   1230                    // Longest match: Need to match fully inside the code point span
   1231                    // to find the match from the latest end.
   1232                }
   1233                if(overlap>spanLength) {
   1234                    overlap=spanLength;
   1235                }
   1236                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
   1237                for(;;) {
   1238                    if(dec>pos || overlap<maxOverlap) {
   1239                        break;
   1240                    }
   1241                    // Try to match if the string is longer or ends later.
   1242                    // Match at code point boundaries. (The UTF-8 strings were converted
   1243                    // from UTF-16 and are guaranteed to be well-formed.)
   1244                    if( !U8_IS_TRAIL(s[pos-dec]) &&
   1245                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
   1246                        matches8(s+pos-dec, s8, length8)
   1247                    ) {
   1248                        maxDec=dec;  // Longest match from latest end.
   1249                        maxOverlap=overlap;
   1250                        break;
   1251                    }
   1252                    --overlap;
   1253                    ++dec;
   1254                }
   1255                s8+=length8;
   1256            }
   1257 
   1258            if(maxDec!=0 || maxOverlap!=0) {
   1259                // Longest-match algorithm, and there was a string match.
   1260                // Simply continue before it.
   1261                pos-=maxDec;
   1262                if(pos==0) {
   1263                    return 0;  // Reached the start of the string.
   1264                }
   1265                spanLength=0;  // Match strings from before a string match.
   1266                continue;
   1267            }
   1268        }
   1269        // Finished trying to match all strings at pos.
   1270 
   1271        if(spanLength!=0 || pos==length) {
   1272            // The position is before an unlimited code point span (spanLength!=0),
   1273            // not before a string match.
   1274            // The only position where spanLength==0 before a span is pos==length.
   1275            // Otherwise, an unlimited code point span is only tried again when no
   1276            // strings match, and if such a non-initial span fails we stop.
   1277            if(offsets.isEmpty()) {
   1278                return pos;  // No strings matched before a span.
   1279            }
   1280            // Match strings from before the next string match.
   1281        } else {
   1282            // The position is before a string match (or a single code point).
   1283            if(offsets.isEmpty()) {
   1284                // No more strings matched before a previous string match.
   1285                // Try another code point span from before the last string match.
   1286                int32_t oldPos=pos;
   1287                pos = spanSet.spanBackUTF8(reinterpret_cast<const char*>(s), oldPos, USET_SPAN_CONTAINED);
   1288                spanLength=oldPos-pos;
   1289                if( pos==0 ||           // Reached the start of the string, or
   1290                    spanLength==0       // neither strings nor span progressed.
   1291                ) {
   1292                    return pos;
   1293                }
   1294                continue;  // spanLength>0: Match strings from before a span.
   1295            } else {
   1296                // Try to match only one code point from before a string match if some
   1297                // string matched beyond it, so that we try all possible positions
   1298                // and don't overshoot.
   1299                spanLength=spanOneBackUTF8(spanSet, s, pos);
   1300                if(spanLength>0) {
   1301                    if(spanLength==pos) {
   1302                        return 0;  // Reached the start of the string.
   1303                    }
   1304                    // Match strings before this code point.
   1305                    // There cannot be any decrements below it because UnicodeSet strings
   1306                    // contain multiple code points.
   1307                    pos-=spanLength;
   1308                    offsets.shift(spanLength);
   1309                    spanLength=0;
   1310                    continue;  // Match strings from before a single code point.
   1311                }
   1312                // Match strings from before the next string match.
   1313            }
   1314        }
   1315        pos-=offsets.popMinimum();
   1316        spanLength=0;  // Match strings from before a string match.
   1317    }
   1318 }
   1319 
   1320 /*
   1321 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
   1322 *
   1323 * Theoretical algorithm:
   1324 * - Iterate through the string, and at each code point boundary:
   1325 *   + If the code point there is in the set, then return with the current position.
   1326 *   + If a set string matches at the current position, then return with the current position.
   1327 *
   1328 * Optimized implementation:
   1329 *
   1330 * (Same assumption as for span() above.)
   1331 *
   1332 * Create and cache a spanNotSet which contains all of the single code points
   1333 * of the original set but none of its strings.
   1334 * For each set string add its initial code point to the spanNotSet.
   1335 * (Also add its final code point for spanNotBack().)
   1336 *
   1337 * - Loop:
   1338 *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
   1339 *   + If the current code point is in the original set, then
   1340 *     return the current position.
   1341 *   + If any set string matches at the current position, then
   1342 *     return the current position.
   1343 *   + If there is no match at the current position, neither for the code point there
   1344 *     nor for any set string, then skip this code point and continue the loop.
   1345 *     This happens for set-string-initial code points that were added to spanNotSet
   1346 *     when there is not actually a match for such a set string.
   1347 */
   1348 
   1349 int32_t UnicodeSetStringSpan::spanNot(const char16_t *s, int32_t length) const {
   1350    int32_t pos=0, rest=length;
   1351    int32_t i, stringsLength=strings.size();
   1352    do {
   1353        // Span until we find a code point from the set,
   1354        // or a code point that starts or ends some string.
   1355        i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
   1356        if(i==rest) {
   1357            return length;  // Reached the end of the string.
   1358        }
   1359        pos+=i;
   1360        rest-=i;
   1361 
   1362        // Check whether the current code point is in the original set,
   1363        // without the string starts and ends.
   1364        int32_t cpLength=spanOne(spanSet, s+pos, rest);
   1365        if(cpLength>0) {
   1366            return pos;  // There is a set element at pos.
   1367        }
   1368 
   1369        // Try to match the strings at pos.
   1370        for(i=0; i<stringsLength; ++i) {
   1371            if(spanLengths[i]==ALL_CP_CONTAINED) {
   1372                continue;  // Irrelevant string. (Also the empty string.)
   1373            }
   1374            const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
   1375            const char16_t *s16=string.getBuffer();
   1376            int32_t length16=string.length();
   1377            U_ASSERT(length>0);
   1378            if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
   1379                return pos;  // There is a set element at pos.
   1380            }
   1381        }
   1382 
   1383        // The span(while not contained) ended on a string start/end which is
   1384        // not in the original set. Skip this code point and continue.
   1385        // cpLength<0
   1386        pos-=cpLength;
   1387        rest+=cpLength;
   1388    } while(rest!=0);
   1389    return length;  // Reached the end of the string.
   1390 }
   1391 
   1392 int32_t UnicodeSetStringSpan::spanNotBack(const char16_t *s, int32_t length) const {
   1393    int32_t pos=length;
   1394    int32_t i, stringsLength=strings.size();
   1395    do {
   1396        // Span until we find a code point from the set,
   1397        // or a code point that starts or ends some string.
   1398        pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
   1399        if(pos==0) {
   1400            return 0;  // Reached the start of the string.
   1401        }
   1402 
   1403        // Check whether the current code point is in the original set,
   1404        // without the string starts and ends.
   1405        int32_t cpLength=spanOneBack(spanSet, s, pos);
   1406        if(cpLength>0) {
   1407            return pos;  // There is a set element at pos.
   1408        }
   1409 
   1410        // Try to match the strings at pos.
   1411        for(i=0; i<stringsLength; ++i) {
   1412            // Use spanLengths rather than a spanBackLengths pointer because
   1413            // it is easier and we only need to know whether the string is irrelevant
   1414            // which is the same in either array.
   1415            if(spanLengths[i]==ALL_CP_CONTAINED) {
   1416                continue;  // Irrelevant string. (Also the empty string.)
   1417            }
   1418            const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i));
   1419            const char16_t *s16=string.getBuffer();
   1420            int32_t length16=string.length();
   1421            U_ASSERT(length>0);
   1422            if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
   1423                return pos;  // There is a set element at pos.
   1424            }
   1425        }
   1426 
   1427        // The span(while not contained) ended on a string start/end which is
   1428        // not in the original set. Skip this code point and continue.
   1429        // cpLength<0
   1430        pos+=cpLength;
   1431    } while(pos!=0);
   1432    return 0;  // Reached the start of the string.
   1433 }
   1434 
   1435 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
   1436    int32_t pos=0, rest=length;
   1437    int32_t i, stringsLength=strings.size();
   1438    uint8_t *spanUTF8Lengths=spanLengths;
   1439    if(all) {
   1440        spanUTF8Lengths+=2*stringsLength;
   1441    }
   1442    do {
   1443        // Span until we find a code point from the set,
   1444        // or a code point that starts or ends some string.
   1445        i = pSpanNotSet->spanUTF8(reinterpret_cast<const char*>(s) + pos, rest, USET_SPAN_NOT_CONTAINED);
   1446        if(i==rest) {
   1447            return length;  // Reached the end of the string.
   1448        }
   1449        pos+=i;
   1450        rest-=i;
   1451 
   1452        // Check whether the current code point is in the original set,
   1453        // without the string starts and ends.
   1454        int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
   1455        if(cpLength>0) {
   1456            return pos;  // There is a set element at pos.
   1457        }
   1458 
   1459        // Try to match the strings at pos.
   1460        const uint8_t *s8=utf8;
   1461        int32_t length8;
   1462        for(i=0; i<stringsLength; ++i) {
   1463            length8=utf8Lengths[i];
   1464            // ALL_CP_CONTAINED: Irrelevant string.
   1465            if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
   1466                return pos;  // There is a set element at pos.
   1467            }
   1468            s8+=length8;
   1469        }
   1470 
   1471        // The span(while not contained) ended on a string start/end which is
   1472        // not in the original set. Skip this code point and continue.
   1473        // cpLength<0
   1474        pos-=cpLength;
   1475        rest+=cpLength;
   1476    } while(rest!=0);
   1477    return length;  // Reached the end of the string.
   1478 }
   1479 
   1480 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
   1481    int32_t pos=length;
   1482    int32_t i, stringsLength=strings.size();
   1483    uint8_t *spanBackUTF8Lengths=spanLengths;
   1484    if(all) {
   1485        spanBackUTF8Lengths+=3*stringsLength;
   1486    }
   1487    do {
   1488        // Span until we find a code point from the set,
   1489        // or a code point that starts or ends some string.
   1490        pos = pSpanNotSet->spanBackUTF8(reinterpret_cast<const char*>(s), pos, USET_SPAN_NOT_CONTAINED);
   1491        if(pos==0) {
   1492            return 0;  // Reached the start of the string.
   1493        }
   1494 
   1495        // Check whether the current code point is in the original set,
   1496        // without the string starts and ends.
   1497        int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
   1498        if(cpLength>0) {
   1499            return pos;  // There is a set element at pos.
   1500        }
   1501 
   1502        // Try to match the strings at pos.
   1503        const uint8_t *s8=utf8;
   1504        int32_t length8;
   1505        for(i=0; i<stringsLength; ++i) {
   1506            length8=utf8Lengths[i];
   1507            // ALL_CP_CONTAINED: Irrelevant string.
   1508            if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
   1509                return pos;  // There is a set element at pos.
   1510            }
   1511            s8+=length8;
   1512        }
   1513 
   1514        // The span(while not contained) ended on a string start/end which is
   1515        // not in the original set. Skip this code point and continue.
   1516        // cpLength<0
   1517        pos+=cpLength;
   1518    } while(pos!=0);
   1519    return 0;  // Reached the start of the string.
   1520 }
   1521 
   1522 U_NAMESPACE_END
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE