tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ucharstrie.cpp (12800B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2010-2011, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  ucharstrie.h
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2010nov14
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/appendable.h"
     19 #include "unicode/ucharstrie.h"
     20 #include "unicode/uobject.h"
     21 #include "unicode/utf16.h"
     22 #include "cmemory.h"
     23 #include "uassert.h"
     24 
     25 U_NAMESPACE_BEGIN
     26 
     27 UCharsTrie::~UCharsTrie() {
     28    uprv_free(ownedArray_);
     29 }
     30 
     31 UStringTrieResult
     32 UCharsTrie::current() const {
     33    const char16_t *pos=pos_;
     34    if(pos==nullptr) {
     35        return USTRINGTRIE_NO_MATCH;
     36    } else {
     37        int32_t node;
     38        return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
     39                valueResult(node) : USTRINGTRIE_NO_VALUE;
     40    }
     41 }
     42 
     43 UStringTrieResult
     44 UCharsTrie::firstForCodePoint(UChar32 cp) {
     45    return cp<=0xffff ?
     46        first(cp) :
     47        (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ?
     48            next(U16_TRAIL(cp)) :
     49            USTRINGTRIE_NO_MATCH);
     50 }
     51 
     52 UStringTrieResult
     53 UCharsTrie::nextForCodePoint(UChar32 cp) {
     54    return cp<=0xffff ?
     55        next(cp) :
     56        (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ?
     57            next(U16_TRAIL(cp)) :
     58            USTRINGTRIE_NO_MATCH);
     59 }
     60 
     61 UStringTrieResult
     62 UCharsTrie::branchNext(const char16_t *pos, int32_t length, int32_t uchar) {
     63    // Branch according to the current unit.
     64    if(length==0) {
     65        length=*pos++;
     66    }
     67    ++length;
     68    // The length of the branch is the number of units to select from.
     69    // The data structure encodes a binary search.
     70    while(length>kMaxBranchLinearSubNodeLength) {
     71        if(uchar<*pos++) {
     72            length>>=1;
     73            pos=jumpByDelta(pos);
     74        } else {
     75            length=length-(length>>1);
     76            pos=skipDelta(pos);
     77        }
     78    }
     79    // Drop down to linear search for the last few units.
     80    // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
     81    // and divides length by 2.
     82    do {
     83        if(uchar==*pos++) {
     84            UStringTrieResult result;
     85            int32_t node=*pos;
     86            if(node&kValueIsFinal) {
     87                // Leave the final value for getValue() to read.
     88                result=USTRINGTRIE_FINAL_VALUE;
     89            } else {
     90                // Use the non-final value as the jump delta.
     91                ++pos;
     92                // int32_t delta=readValue(pos, node);
     93                int32_t delta;
     94                if(node<kMinTwoUnitValueLead) {
     95                    delta=node;
     96                } else if(node<kThreeUnitValueLead) {
     97                    delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
     98                } else {
     99                    delta=(pos[0]<<16)|pos[1];
    100                    pos+=2;
    101                }
    102                // end readValue()
    103                pos+=delta;
    104                node=*pos;
    105                result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
    106            }
    107            pos_=pos;
    108            return result;
    109        }
    110        --length;
    111        pos=skipValue(pos);
    112    } while(length>1);
    113    if(uchar==*pos++) {
    114        pos_=pos;
    115        int32_t node=*pos;
    116        return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
    117    } else {
    118        stop();
    119        return USTRINGTRIE_NO_MATCH;
    120    }
    121 }
    122 
    123 UStringTrieResult
    124 UCharsTrie::nextImpl(const char16_t *pos, int32_t uchar) {
    125    int32_t node=*pos++;
    126    for(;;) {
    127        if(node<kMinLinearMatch) {
    128            return branchNext(pos, node, uchar);
    129        } else if(node<kMinValueLead) {
    130            // Match the first of length+1 units.
    131            int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
    132            if(uchar==*pos++) {
    133                remainingMatchLength_=--length;
    134                pos_=pos;
    135                return (length<0 && (node=*pos)>=kMinValueLead) ?
    136                        valueResult(node) : USTRINGTRIE_NO_VALUE;
    137            } else {
    138                // No match.
    139                break;
    140            }
    141        } else if(node&kValueIsFinal) {
    142            // No further matching units.
    143            break;
    144        } else {
    145            // Skip intermediate value.
    146            pos=skipNodeValue(pos, node);
    147            node&=kNodeTypeMask;
    148        }
    149    }
    150    stop();
    151    return USTRINGTRIE_NO_MATCH;
    152 }
    153 
    154 UStringTrieResult
    155 UCharsTrie::next(int32_t uchar) {
    156    const char16_t *pos=pos_;
    157    if(pos==nullptr) {
    158        return USTRINGTRIE_NO_MATCH;
    159    }
    160    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    161    if(length>=0) {
    162        // Remaining part of a linear-match node.
    163        if(uchar==*pos++) {
    164            remainingMatchLength_=--length;
    165            pos_=pos;
    166            int32_t node;
    167            return (length<0 && (node=*pos)>=kMinValueLead) ?
    168                    valueResult(node) : USTRINGTRIE_NO_VALUE;
    169        } else {
    170            stop();
    171            return USTRINGTRIE_NO_MATCH;
    172        }
    173    }
    174    return nextImpl(pos, uchar);
    175 }
    176 
    177 UStringTrieResult
    178 UCharsTrie::next(ConstChar16Ptr ptr, int32_t sLength) {
    179    const char16_t *s=ptr;
    180    if(sLength<0 ? *s==0 : sLength==0) {
    181        // Empty input.
    182        return current();
    183    }
    184    const char16_t *pos=pos_;
    185    if(pos==nullptr) {
    186        return USTRINGTRIE_NO_MATCH;
    187    }
    188    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    189    for(;;) {
    190        // Fetch the next input unit, if there is one.
    191        // Continue a linear-match node without rechecking sLength<0.
    192        int32_t uchar;
    193        if(sLength<0) {
    194            for(;;) {
    195                if((uchar=*s++)==0) {
    196                    remainingMatchLength_=length;
    197                    pos_=pos;
    198                    int32_t node;
    199                    return (length<0 && (node=*pos)>=kMinValueLead) ?
    200                            valueResult(node) : USTRINGTRIE_NO_VALUE;
    201                }
    202                if(length<0) {
    203                    remainingMatchLength_=length;
    204                    break;
    205                }
    206                if(uchar!=*pos) {
    207                    stop();
    208                    return USTRINGTRIE_NO_MATCH;
    209                }
    210                ++pos;
    211                --length;
    212            }
    213        } else {
    214            for(;;) {
    215                if(sLength==0) {
    216                    remainingMatchLength_=length;
    217                    pos_=pos;
    218                    int32_t node;
    219                    return (length<0 && (node=*pos)>=kMinValueLead) ?
    220                            valueResult(node) : USTRINGTRIE_NO_VALUE;
    221                }
    222                uchar=*s++;
    223                --sLength;
    224                if(length<0) {
    225                    remainingMatchLength_=length;
    226                    break;
    227                }
    228                if(uchar!=*pos) {
    229                    stop();
    230                    return USTRINGTRIE_NO_MATCH;
    231                }
    232                ++pos;
    233                --length;
    234            }
    235        }
    236        int32_t node=*pos++;
    237        for(;;) {
    238            if(node<kMinLinearMatch) {
    239                UStringTrieResult result=branchNext(pos, node, uchar);
    240                if(result==USTRINGTRIE_NO_MATCH) {
    241                    return USTRINGTRIE_NO_MATCH;
    242                }
    243                // Fetch the next input unit, if there is one.
    244                if(sLength<0) {
    245                    if((uchar=*s++)==0) {
    246                        return result;
    247                    }
    248                } else {
    249                    if(sLength==0) {
    250                        return result;
    251                    }
    252                    uchar=*s++;
    253                    --sLength;
    254                }
    255                if(result==USTRINGTRIE_FINAL_VALUE) {
    256                    // No further matching units.
    257                    stop();
    258                    return USTRINGTRIE_NO_MATCH;
    259                }
    260                pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
    261                node=*pos++;
    262            } else if(node<kMinValueLead) {
    263                // Match length+1 units.
    264                length=node-kMinLinearMatch;  // Actual match length minus 1.
    265                if(uchar!=*pos) {
    266                    stop();
    267                    return USTRINGTRIE_NO_MATCH;
    268                }
    269                ++pos;
    270                --length;
    271                break;
    272            } else if(node&kValueIsFinal) {
    273                // No further matching units.
    274                stop();
    275                return USTRINGTRIE_NO_MATCH;
    276            } else {
    277                // Skip intermediate value.
    278                pos=skipNodeValue(pos, node);
    279                node&=kNodeTypeMask;
    280            }
    281        }
    282    }
    283 }
    284 
    285 const char16_t *
    286 UCharsTrie::findUniqueValueFromBranch(const char16_t *pos, int32_t length,
    287                                      UBool haveUniqueValue, int32_t &uniqueValue) {
    288    while(length>kMaxBranchLinearSubNodeLength) {
    289        ++pos;  // ignore the comparison unit
    290        if(nullptr==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
    291            return nullptr;
    292        }
    293        length=length-(length>>1);
    294        pos=skipDelta(pos);
    295    }
    296    do {
    297        ++pos;  // ignore a comparison unit
    298        // handle its value
    299        int32_t node=*pos++;
    300        UBool isFinal = static_cast<UBool>(node >> 15);
    301        node&=0x7fff;
    302        int32_t value=readValue(pos, node);
    303        pos=skipValue(pos, node);
    304        if(isFinal) {
    305            if(haveUniqueValue) {
    306                if(value!=uniqueValue) {
    307                    return nullptr;
    308                }
    309            } else {
    310                uniqueValue=value;
    311                haveUniqueValue=true;
    312            }
    313        } else {
    314            if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
    315                return nullptr;
    316            }
    317            haveUniqueValue=true;
    318        }
    319    } while(--length>1);
    320    return pos+1;  // ignore the last comparison unit
    321 }
    322 
    323 UBool
    324 UCharsTrie::findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
    325    int32_t node=*pos++;
    326    for(;;) {
    327        if(node<kMinLinearMatch) {
    328            if(node==0) {
    329                node=*pos++;
    330            }
    331            pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
    332            if(pos==nullptr) {
    333                return false;
    334            }
    335            haveUniqueValue=true;
    336            node=*pos++;
    337        } else if(node<kMinValueLead) {
    338            // linear-match node
    339            pos+=node-kMinLinearMatch+1;  // Ignore the match units.
    340            node=*pos++;
    341        } else {
    342            UBool isFinal = static_cast<UBool>(node >> 15);
    343            int32_t value;
    344            if(isFinal) {
    345                value=readValue(pos, node&0x7fff);
    346            } else {
    347                value=readNodeValue(pos, node);
    348            }
    349            if(haveUniqueValue) {
    350                if(value!=uniqueValue) {
    351                    return false;
    352                }
    353            } else {
    354                uniqueValue=value;
    355                haveUniqueValue=true;
    356            }
    357            if(isFinal) {
    358                return true;
    359            }
    360            pos=skipNodeValue(pos, node);
    361            node&=kNodeTypeMask;
    362        }
    363    }
    364 }
    365 
    366 int32_t
    367 UCharsTrie::getNextUChars(Appendable &out) const {
    368    const char16_t *pos=pos_;
    369    if(pos==nullptr) {
    370        return 0;
    371    }
    372    if(remainingMatchLength_>=0) {
    373        out.appendCodeUnit(*pos);  // Next unit of a pending linear-match node.
    374        return 1;
    375    }
    376    int32_t node=*pos++;
    377    if(node>=kMinValueLead) {
    378        if(node&kValueIsFinal) {
    379            return 0;
    380        } else {
    381            pos=skipNodeValue(pos, node);
    382            node&=kNodeTypeMask;
    383        }
    384    }
    385    if(node<kMinLinearMatch) {
    386        if(node==0) {
    387            node=*pos++;
    388        }
    389        out.reserveAppendCapacity(++node);
    390        getNextBranchUChars(pos, node, out);
    391        return node;
    392    } else {
    393        // First unit of the linear-match node.
    394        out.appendCodeUnit(*pos);
    395        return 1;
    396    }
    397 }
    398 
    399 void
    400 UCharsTrie::getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out) {
    401    while(length>kMaxBranchLinearSubNodeLength) {
    402        ++pos;  // ignore the comparison unit
    403        getNextBranchUChars(jumpByDelta(pos), length>>1, out);
    404        length=length-(length>>1);
    405        pos=skipDelta(pos);
    406    }
    407    do {
    408        out.appendCodeUnit(*pos++);
    409        pos=skipValue(pos);
    410    } while(--length>1);
    411    out.appendCodeUnit(*pos);
    412 }
    413 
    414 U_NAMESPACE_END