tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

bmpset.cpp (25240B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2007-2012, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  bmpset.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2007jan29
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/utf8.h"
     22 #include "unicode/utf16.h"
     23 #include "cmemory.h"
     24 #include "bmpset.h"
     25 #include "uassert.h"
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
     30        list(parentList), listLength(parentListLength) {
     31    uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
     32    uprv_memset(table7FF, 0, sizeof(table7FF));
     33    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
     34 
     35    /*
     36     * Set the list indexes for binary searches for
     37     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
     38     * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
     39     * looked up in the bit tables.
     40     * The last pair of indexes is for finding supplementary code points.
     41     */
     42    list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
     43    int32_t i;
     44    for(i=1; i<=0x10; ++i) {
     45        list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
     46    }
     47    list4kStarts[0x11]=listLength-1;
     48    containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
     49 
     50    initBits();
     51    overrideIllegal();
     52 }
     53 
     54 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
     55        containsFFFD(otherBMPSet.containsFFFD),
     56        list(newParentList), listLength(newParentListLength) {
     57    uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
     58    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
     59    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
     60    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
     61 }
     62 
     63 BMPSet::~BMPSet() {
     64 }
     65 
     66 /*
     67 * Set bits in a bit rectangle in "vertical" bit organization.
     68 * start<limit<=0x800
     69 */
     70 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
     71    U_ASSERT(start<limit);
     72    U_ASSERT(limit<=0x800);
     73 
     74    int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
     75    int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
     76 
     77    // Set one bit indicating an all-one block.
     78    uint32_t bits = static_cast<uint32_t>(1) << lead;
     79    if((start+1)==limit) {  // Single-character shortcut.
     80        table[trail]|=bits;
     81        return;
     82    }
     83 
     84    int32_t limitLead=limit>>6;
     85    int32_t limitTrail=limit&0x3f;
     86 
     87    if(lead==limitLead) {
     88        // Partial vertical bit column.
     89        while(trail<limitTrail) {
     90            table[trail++]|=bits;
     91        }
     92    } else {
     93        // Partial vertical bit column,
     94        // followed by a bit rectangle,
     95        // followed by another partial vertical bit column.
     96        if(trail>0) {
     97            do {
     98                table[trail++]|=bits;
     99            } while(trail<64);
    100            ++lead;
    101        }
    102        if(lead<limitLead) {
    103            bits = ~((static_cast<unsigned>(1) << lead) - 1);
    104            if(limitLead<0x20) {
    105                bits &= (static_cast<unsigned>(1) << limitLead) - 1;
    106            }
    107            for(trail=0; trail<64; ++trail) {
    108                table[trail]|=bits;
    109            }
    110        }
    111        // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
    112        // In that case, bits=1<<limitLead is undefined but the bits value
    113        // is not used because trail<limitTrail is already false.
    114        bits = static_cast<uint32_t>(1) << ((limitLead == 0x20) ? (limitLead - 1) : limitLead);
    115        for(trail=0; trail<limitTrail; ++trail) {
    116            table[trail]|=bits;
    117        }
    118    }
    119 }
    120 
    121 void BMPSet::initBits() {
    122    UChar32 start, limit;
    123    int32_t listIndex=0;
    124 
    125    // Set latin1Contains[].
    126    do {
    127        start=list[listIndex++];
    128        if(listIndex<listLength) {
    129            limit=list[listIndex++];
    130        } else {
    131            limit=0x110000;
    132        }
    133        if(start>=0x100) {
    134            break;
    135        }
    136        do {
    137            latin1Contains[start++]=1;
    138        } while(start<limit && start<0x100);
    139    } while(limit<=0x100);
    140 
    141    // Find the first range overlapping with (or after) 80..FF again,
    142    // to include them in table7FF as well.
    143    for(listIndex=0;;) {
    144        start=list[listIndex++];
    145        if(listIndex<listLength) {
    146            limit=list[listIndex++];
    147        } else {
    148            limit=0x110000;
    149        }
    150        if(limit>0x80) {
    151            if(start<0x80) {
    152                start=0x80;
    153            }
    154            break;
    155        }
    156    }
    157 
    158    // Set table7FF[].
    159    while(start<0x800) {
    160        set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
    161        if(limit>0x800) {
    162            start=0x800;
    163            break;
    164        }
    165 
    166        start=list[listIndex++];
    167        if(listIndex<listLength) {
    168            limit=list[listIndex++];
    169        } else {
    170            limit=0x110000;
    171        }
    172    }
    173 
    174    // Set bmpBlockBits[].
    175    int32_t minStart=0x800;
    176    while(start<0x10000) {
    177        if(limit>0x10000) {
    178            limit=0x10000;
    179        }
    180 
    181        if(start<minStart) {
    182            start=minStart;
    183        }
    184        if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
    185            if(start&0x3f) {
    186                // Mixed-value block of 64 code points.
    187                start>>=6;
    188                bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
    189                start=(start+1)<<6;  // Round up to the next block boundary.
    190                minStart=start;      // Ignore further ranges in this block.
    191            }
    192            if(start<limit) {
    193                if(start<(limit&~0x3f)) {
    194                    // Multiple all-ones blocks of 64 code points each.
    195                    set32x64Bits(bmpBlockBits, start>>6, limit>>6);
    196                }
    197 
    198                if(limit&0x3f) {
    199                    // Mixed-value block of 64 code points.
    200                    limit>>=6;
    201                    bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
    202                    limit=(limit+1)<<6;  // Round up to the next block boundary.
    203                    minStart=limit;      // Ignore further ranges in this block.
    204                }
    205            }
    206        }
    207 
    208        if(limit==0x10000) {
    209            break;
    210        }
    211 
    212        start=list[listIndex++];
    213        if(listIndex<listLength) {
    214            limit=list[listIndex++];
    215        } else {
    216            limit=0x110000;
    217        }
    218    }
    219 }
    220 
    221 /*
    222 * Override some bits and bytes to the result of contains(FFFD)
    223 * for faster validity checking at runtime.
    224 * No need to set 0 values where they were reset to 0 in the constructor
    225 * and not modified by initBits().
    226 * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
    227 * Need to set 0 values for surrogates D800..DFFF.
    228 */
    229 void BMPSet::overrideIllegal() {
    230    uint32_t bits, mask;
    231    int32_t i;
    232 
    233    if(containsFFFD) {
    234        bits=3;                 // Lead bytes 0xC0 and 0xC1.
    235        for(i=0; i<64; ++i) {
    236            table7FF[i]|=bits;
    237        }
    238 
    239        bits=1;                 // Lead byte 0xE0.
    240        for(i=0; i<32; ++i) {   // First half of 4k block.
    241            bmpBlockBits[i]|=bits;
    242        }
    243 
    244        mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.
    245        bits=1<<0xd;
    246        for(i=32; i<64; ++i) {  // Second half of 4k block.
    247            bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
    248        }
    249    } else {
    250        mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.
    251        for(i=32; i<64; ++i) {  // Second half of 4k block.
    252            bmpBlockBits[i]&=mask;
    253        }
    254    }
    255 }
    256 
    257 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
    258    /* Examples:
    259                                       findCodePoint(c)
    260       set              list[]         c=0 1 3 4 7 8
    261       ===              ==============   ===========
    262       []               [110000]         0 0 0 0 0 0
    263       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
    264       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
    265       [:Any:]          [0, 110000]      1 1 1 1 1 1
    266     */
    267 
    268    // Return the smallest i such that c < list[i].  Assume
    269    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
    270    if (c < list[lo])
    271        return lo;
    272    // High runner test.  c is often after the last range, so an
    273    // initial check for this condition pays off.
    274    if (lo >= hi || c >= list[hi-1])
    275        return hi;
    276    // invariant: c >= list[lo]
    277    // invariant: c < list[hi]
    278    for (;;) {
    279        int32_t i = (lo + hi) >> 1;
    280        if (i == lo) {
    281            break; // Found!
    282        } else if (c < list[i]) {
    283            hi = i;
    284        } else {
    285            lo = i;
    286        }
    287    }
    288    return hi;
    289 }
    290 
    291 UBool
    292 BMPSet::contains(UChar32 c) const {
    293    if (static_cast<uint32_t>(c) <= 0xff) {
    294        return latin1Contains[c];
    295    } else if (static_cast<uint32_t>(c) <= 0x7ff) {
    296        return (table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0;
    297    } else if (static_cast<uint32_t>(c) < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
    298        int lead=c>>12;
    299        uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    300        if(twoBits<=1) {
    301            // All 64 code points with the same bits 15..6
    302            // are either in the set or not.
    303            return twoBits;
    304        } else {
    305            // Look up the code point in its 4k block of code points.
    306            return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
    307        }
    308    } else if (static_cast<uint32_t>(c) <= 0x10ffff) {
    309        // surrogate or supplementary code point
    310        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
    311    } else {
    312        // Out-of-range code points get false, consistent with long-standing
    313        // behavior of UnicodeSet::contains(c).
    314        return false;
    315    }
    316 }
    317 
    318 /*
    319 * Check for sufficient length for trail unit for each surrogate pair.
    320 * Handle single surrogates as surrogate code points as usual in ICU.
    321 */
    322 const char16_t *
    323 BMPSet::span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {
    324    char16_t c, c2;
    325 
    326    if(spanCondition) {
    327        // span
    328        do {
    329            c=*s;
    330            if(c<=0xff) {
    331                if(!latin1Contains[c]) {
    332                    break;
    333                }
    334            } else if(c<=0x7ff) {
    335                if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) == 0) {
    336                    break;
    337                }
    338            } else if(c<0xd800 || c>=0xe000) {
    339                int lead=c>>12;
    340                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    341                if(twoBits<=1) {
    342                    // All 64 code points with the same bits 15..6
    343                    // are either in the set or not.
    344                    if(twoBits==0) {
    345                        break;
    346                    }
    347                } else {
    348                    // Look up the code point in its 4k block of code points.
    349                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    350                        break;
    351                    }
    352                }
    353            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
    354                // surrogate code point
    355                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    356                    break;
    357                }
    358            } else {
    359                // surrogate pair
    360                if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
    361                    break;
    362                }
    363                ++s;
    364            }
    365        } while(++s<limit);
    366    } else {
    367        // span not
    368        do {
    369            c=*s;
    370            if(c<=0xff) {
    371                if(latin1Contains[c]) {
    372                    break;
    373                }
    374            } else if(c<=0x7ff) {
    375                if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) {
    376                    break;
    377                }
    378            } else if(c<0xd800 || c>=0xe000) {
    379                int lead=c>>12;
    380                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    381                if(twoBits<=1) {
    382                    // All 64 code points with the same bits 15..6
    383                    // are either in the set or not.
    384                    if(twoBits!=0) {
    385                        break;
    386                    }
    387                } else {
    388                    // Look up the code point in its 4k block of code points.
    389                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    390                        break;
    391                    }
    392                }
    393            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
    394                // surrogate code point
    395                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    396                    break;
    397                }
    398            } else {
    399                // surrogate pair
    400                if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
    401                    break;
    402                }
    403                ++s;
    404            }
    405        } while(++s<limit);
    406    }
    407    return s;
    408 }
    409 
    410 /* Symmetrical with span(). */
    411 const char16_t *
    412 BMPSet::spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {
    413    char16_t c, c2;
    414 
    415    if(spanCondition) {
    416        // span
    417        for(;;) {
    418            c=*(--limit);
    419            if(c<=0xff) {
    420                if(!latin1Contains[c]) {
    421                    break;
    422                }
    423            } else if(c<=0x7ff) {
    424                if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) == 0) {
    425                    break;
    426                }
    427            } else if(c<0xd800 || c>=0xe000) {
    428                int lead=c>>12;
    429                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    430                if(twoBits<=1) {
    431                    // All 64 code points with the same bits 15..6
    432                    // are either in the set or not.
    433                    if(twoBits==0) {
    434                        break;
    435                    }
    436                } else {
    437                    // Look up the code point in its 4k block of code points.
    438                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    439                        break;
    440                    }
    441                }
    442            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
    443                // surrogate code point
    444                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    445                    break;
    446                }
    447            } else {
    448                // surrogate pair
    449                if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
    450                    break;
    451                }
    452                --limit;
    453            }
    454            if(s==limit) {
    455                return s;
    456            }
    457        }
    458    } else {
    459        // span not
    460        for(;;) {
    461            c=*(--limit);
    462            if(c<=0xff) {
    463                if(latin1Contains[c]) {
    464                    break;
    465                }
    466            } else if(c<=0x7ff) {
    467                if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) {
    468                    break;
    469                }
    470            } else if(c<0xd800 || c>=0xe000) {
    471                int lead=c>>12;
    472                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    473                if(twoBits<=1) {
    474                    // All 64 code points with the same bits 15..6
    475                    // are either in the set or not.
    476                    if(twoBits!=0) {
    477                        break;
    478                    }
    479                } else {
    480                    // Look up the code point in its 4k block of code points.
    481                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
    482                        break;
    483                    }
    484                }
    485            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
    486                // surrogate code point
    487                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
    488                    break;
    489                }
    490            } else {
    491                // surrogate pair
    492                if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
    493                    break;
    494                }
    495                --limit;
    496            }
    497            if(s==limit) {
    498                return s;
    499            }
    500        }
    501    }
    502    return limit+1;
    503 }
    504 
    505 /*
    506 * Precheck for sufficient trail bytes at end of string only once per span.
    507 * Check validity.
    508 */
    509 const uint8_t *
    510 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    511    const uint8_t *limit=s+length;
    512    uint8_t b=*s;
    513    if(U8_IS_SINGLE(b)) {
    514        // Initial all-ASCII span.
    515        if(spanCondition) {
    516            do {
    517                if(!latin1Contains[b] || ++s==limit) {
    518                    return s;
    519                }
    520                b=*s;
    521            } while(U8_IS_SINGLE(b));
    522        } else {
    523            do {
    524                if(latin1Contains[b] || ++s==limit) {
    525                    return s;
    526                }
    527                b=*s;
    528            } while(U8_IS_SINGLE(b));
    529        }
    530        length = static_cast<int32_t>(limit - s);
    531    }
    532 
    533    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
    534        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    535    }
    536 
    537    const uint8_t *limit0=limit;
    538 
    539    /*
    540     * Make sure that the last 1/2/3/4-byte sequence before limit is complete
    541     * or runs into a lead byte.
    542     * In the span loop compare s with limit only once
    543     * per multi-byte character.
    544     *
    545     * Give a trailing illegal sequence the same value as the result of contains(FFFD),
    546     * including it if that is part of the span, otherwise set limit0 to before
    547     * the truncated sequence.
    548     */
    549    b=*(limit-1);
    550    if (static_cast<int8_t>(b) < 0) {
    551        // b>=0x80: lead or trail byte
    552        if(b<0xc0) {
    553            // single trail byte, check for preceding 3- or 4-byte lead byte
    554            if(length>=2 && (b=*(limit-2))>=0xe0) {
    555                limit-=2;
    556                if(containsFFFD!=spanCondition) {
    557                    limit0=limit;
    558                }
    559            } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
    560                // 4-byte lead byte with only two trail bytes
    561                limit-=3;
    562                if(containsFFFD!=spanCondition) {
    563                    limit0=limit;
    564                }
    565            }
    566        } else {
    567            // lead byte with no trail bytes
    568            --limit;
    569            if(containsFFFD!=spanCondition) {
    570                limit0=limit;
    571            }
    572        }
    573    }
    574 
    575    uint8_t t1, t2, t3;
    576 
    577    while(s<limit) {
    578        b=*s;
    579        if(U8_IS_SINGLE(b)) {
    580            // ASCII
    581            if(spanCondition) {
    582                do {
    583                    if(!latin1Contains[b]) {
    584                        return s;
    585                    } else if(++s==limit) {
    586                        return limit0;
    587                    }
    588                    b=*s;
    589                } while(U8_IS_SINGLE(b));
    590            } else {
    591                do {
    592                    if(latin1Contains[b]) {
    593                        return s;
    594                    } else if(++s==limit) {
    595                        return limit0;
    596                    }
    597                    b=*s;
    598                } while(U8_IS_SINGLE(b));
    599            }
    600        }
    601        ++s;  // Advance past the lead byte.
    602        if(b>=0xe0) {
    603            if(b<0xf0) {
    604                if( /* handle U+0000..U+FFFF inline */
    605                    (t1 = static_cast<uint8_t>(s[0] - 0x80)) <= 0x3f &&
    606                    (t2 = static_cast<uint8_t>(s[1] - 0x80)) <= 0x3f
    607                ) {
    608                    b&=0xf;
    609                    uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
    610                    if(twoBits<=1) {
    611                        // All 64 code points with this lead byte and middle trail byte
    612                        // are either in the set or not.
    613                        if (twoBits != static_cast<uint32_t>(spanCondition)) {
    614                            return s-1;
    615                        }
    616                    } else {
    617                        // Look up the code point in its 4k block of code points.
    618                        UChar32 c=(b<<12)|(t1<<6)|t2;
    619                        if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
    620                            return s-1;
    621                        }
    622                    }
    623                    s+=2;
    624                    continue;
    625                }
    626            } else if( /* handle U+10000..U+10FFFF inline */
    627                (t1 = static_cast<uint8_t>(s[0] - 0x80)) <= 0x3f &&
    628                (t2 = static_cast<uint8_t>(s[1] - 0x80)) <= 0x3f &&
    629                (t3 = static_cast<uint8_t>(s[2] - 0x80)) <= 0x3f
    630            ) {
    631                // Give an illegal sequence the same value as the result of contains(FFFD).
    632                UChar32 c = (static_cast<UChar32>(b - 0xf0) << 18) | (static_cast<UChar32>(t1) << 12) | (t2 << 6) | t3;
    633                if( (   (0x10000<=c && c<=0x10ffff) ?
    634                            containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
    635                            containsFFFD
    636                    ) != spanCondition
    637                ) {
    638                    return s-1;
    639                }
    640                s+=3;
    641                continue;
    642            }
    643        } else {
    644            if( /* handle U+0000..U+07FF inline */
    645                b>=0xc0 &&
    646                (t1 = static_cast<uint8_t>(*s - 0x80)) <= 0x3f
    647            ) {
    648                if (static_cast<USetSpanCondition>((table7FF[t1] & (static_cast<uint32_t>(1) << (b & 0x1f))) != 0) != spanCondition) {
    649                    return s-1;
    650                }
    651                ++s;
    652                continue;
    653            }
    654        }
    655 
    656        // Give an illegal sequence the same value as the result of contains(FFFD).
    657        // Handle each byte of an illegal sequence separately to simplify the code;
    658        // no need to optimize error handling.
    659        if(containsFFFD!=spanCondition) {
    660            return s-1;
    661        }
    662    }
    663 
    664    return limit0;
    665 }
    666 
    667 /*
    668 * While going backwards through UTF-8 optimize only for ASCII.
    669 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
    670 * possible to tell from the last byte in a multi-byte sequence how many
    671 * preceding bytes there should be. Therefore, going backwards through UTF-8
    672 * is much harder than going forward.
    673 */
    674 int32_t
    675 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    676    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
    677        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    678    }
    679 
    680    uint8_t b;
    681 
    682    do {
    683        b=s[--length];
    684        if(U8_IS_SINGLE(b)) {
    685            // ASCII sub-span
    686            if(spanCondition) {
    687                do {
    688                    if(!latin1Contains[b]) {
    689                        return length+1;
    690                    } else if(length==0) {
    691                        return 0;
    692                    }
    693                    b=s[--length];
    694                } while(U8_IS_SINGLE(b));
    695            } else {
    696                do {
    697                    if(latin1Contains[b]) {
    698                        return length+1;
    699                    } else if(length==0) {
    700                        return 0;
    701                    }
    702                    b=s[--length];
    703                } while(U8_IS_SINGLE(b));
    704            }
    705        }
    706 
    707        int32_t prev=length;
    708        UChar32 c;
    709        // trail byte: collect a multi-byte character
    710        // (or  lead byte in last-trail position)
    711        c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
    712        // c is a valid code point, not ASCII, not a surrogate
    713        if(c<=0x7ff) {
    714            if (static_cast<USetSpanCondition>((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) != spanCondition) {
    715                return prev+1;
    716            }
    717        } else if(c<=0xffff) {
    718            int lead=c>>12;
    719            uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
    720            if(twoBits<=1) {
    721                // All 64 code points with the same bits 15..6
    722                // are either in the set or not.
    723                if (twoBits != static_cast<uint32_t>(spanCondition)) {
    724                    return prev+1;
    725                }
    726            } else {
    727                // Look up the code point in its 4k block of code points.
    728                if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
    729                    return prev+1;
    730                }
    731            }
    732        } else {
    733            if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
    734                return prev+1;
    735            }
    736        }
    737    } while(length>0);
    738    return 0;
    739 }
    740 
    741 U_NAMESPACE_END