tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

util_props.cpp (7054B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2001-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   11/19/2001  aliu        Creation.
     10 **********************************************************************
     11 */
     12 
     13 #include "unicode/uchar.h"
     14 #include "unicode/utf16.h"
     15 #include "patternprops.h"
     16 #include "util.h"
     17 
     18 U_NAMESPACE_BEGIN
     19 
     20 /**
     21 * Parse an integer at pos, either of the form \d+ or of the form
     22 * 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
     23 * or octal format.
     24 * @param pos INPUT-OUTPUT parameter.  On input, the first
     25 * character to parse.  On output, the character after the last
     26 * parsed character.
     27 */
     28 int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) {
     29    int32_t count = 0;
     30    int32_t value = 0;
     31    int32_t p = pos;
     32    int8_t radix = 10;
     33 
     34    if (p < limit && rule.charAt(p) == 48 /*0*/) {
     35        if (p+1 < limit && (rule.charAt(p+1) == 0x78 /*x*/ || rule.charAt(p+1) == 0x58 /*X*/)) {
     36            p += 2;
     37            radix = 16;
     38        }
     39        else {
     40            p++;
     41            count = 1;
     42            radix = 8;
     43        }
     44    }
     45 
     46    while (p < limit) {
     47        int32_t d = u_digit(rule.charAt(p++), radix);
     48        if (d < 0) {
     49            --p;
     50            break;
     51        }
     52        ++count;
     53        int32_t v = (value * radix) + d;
     54        if (v <= value) {
     55            // If there are too many input digits, at some point
     56            // the value will go negative, e.g., if we have seen
     57            // "0x8000000" already and there is another '0', when
     58            // we parse the next 0 the value will go negative.
     59            return 0;
     60        }
     61        value = v;
     62    }
     63    if (count > 0) {
     64        pos = p;
     65    }
     66    return value;
     67 }
     68 
     69 /**
     70 * Parse a pattern string starting at offset pos.  Keywords are
     71 * matched case-insensitively.  Spaces may be skipped and may be
     72 * optional or required.  Integer values may be parsed, and if
     73 * they are, they will be returned in the given array.  If
     74 * successful, the offset of the next non-space character is
     75 * returned.  On failure, -1 is returned.
     76 * @param pattern must only contain lowercase characters, which
     77 * will match their uppercase equivalents as well.  A space
     78 * character matches one or more required spaces.  A '~' character
     79 * matches zero or more optional spaces.  A '#' character matches
     80 * an integer and stores it in parsedInts, which the caller must
     81 * ensure has enough capacity.
     82 * @param parsedInts array to receive parsed integers.  Caller
     83 * must ensure that parsedInts.length is >= the number of '#'
     84 * signs in 'pattern'.
     85 * @return the position after the last character parsed, or -1 if
     86 * the parse failed
     87 */
     88 int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
     89                              const UnicodeString& pattern, int32_t* parsedInts) {
     90    // TODO Update this to handle surrogates
     91    int32_t p;
     92    int32_t intCount = 0; // number of integers parsed
     93    for (int32_t i=0; i<pattern.length(); ++i) {
     94        char16_t cpat = pattern.charAt(i);
     95        char16_t c;
     96        switch (cpat) {
     97        case 32 /*' '*/:
     98            if (pos >= limit) {
     99                return -1;
    100            }
    101            c = rule.charAt(pos++);
    102            if (!PatternProps::isWhiteSpace(c)) {
    103                return -1;
    104            }
    105            // FALL THROUGH to skipWhitespace
    106            U_FALLTHROUGH;
    107        case 126 /*'~'*/:
    108            pos = skipWhitespace(rule, pos);
    109            break;
    110        case 35 /*'#'*/:
    111            p = pos;
    112            parsedInts[intCount++] = parseInteger(rule, p, limit);
    113            if (p == pos) {
    114                // Syntax error; failed to parse integer
    115                return -1;
    116            }
    117            pos = p;
    118            break;
    119        default:
    120            if (pos >= limit) {
    121                return -1;
    122            }
    123            c = static_cast<char16_t>(u_tolower(rule.charAt(pos++)));
    124            if (c != cpat) {
    125                return -1;
    126            }
    127            break;
    128        }
    129    }
    130    return pos;
    131 }
    132 
    133 /**
    134 * Parse a Unicode identifier from the given string at the given
    135 * position.  Return the identifier, or an empty string if there
    136 * is no identifier.
    137 * @param str the string to parse
    138 * @param pos INPUT-OUTPUT parameter.  On INPUT, pos is the
    139 * first character to examine.  It must be less than str.length(),
    140 * and it must not point to a whitespace character.  That is, must
    141 * have pos < str.length().  On
    142 * OUTPUT, the position after the last parsed character.
    143 * @return the Unicode identifier, or an empty string if there is
    144 * no valid identifier at pos.
    145 */
    146 UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
    147    // assert(pos < str.length());
    148    UnicodeString buf;
    149    int p = pos;
    150    while (p < str.length()) {
    151        UChar32 ch = str.char32At(p);
    152        if (buf.length() == 0) {
    153            if (u_isIDStart(ch)) {
    154                buf.append(ch);
    155            } else {
    156                buf.truncate(0);
    157                return buf;
    158            }
    159        } else {
    160            if (u_isIDPart(ch)) {
    161                buf.append(ch);
    162            } else {
    163                break;
    164            }
    165        }
    166        p += U16_LENGTH(ch);
    167    }
    168    pos = p;
    169    return buf;
    170 }
    171 
    172 /**
    173 * Parse an unsigned 31-bit integer at the given offset.  Use
    174 * UCharacter.digit() to parse individual characters into digits.
    175 * @param text the text to be parsed
    176 * @param pos INPUT-OUTPUT parameter.  On entry, pos[0] is the
    177 * offset within text at which to start parsing; it should point
    178 * to a valid digit.  On exit, pos[0] is the offset after the last
    179 * parsed character.  If the parse failed, it will be unchanged on
    180 * exit.  Must be >= 0 on entry.
    181 * @param radix the radix in which to parse; must be >= 2 and <=
    182 * 36.
    183 * @return a non-negative parsed number, or -1 upon parse failure.
    184 * Parse fails if there are no digits, that is, if pos[0] does not
    185 * point to a valid digit on entry, or if the number to be parsed
    186 * does not fit into a 31-bit unsigned integer.
    187 */
    188 int32_t ICU_Utility::parseNumber(const UnicodeString& text,
    189                                 int32_t& pos, int8_t radix) {
    190    // assert(pos[0] >= 0);
    191    // assert(radix >= 2);
    192    // assert(radix <= 36);
    193    int32_t n = 0;
    194    int32_t p = pos;
    195    while (p < text.length()) {
    196        UChar32 ch = text.char32At(p);
    197        int32_t d = u_digit(ch, radix);
    198        if (d < 0) {
    199            break;
    200        }
    201        int64_t update = radix*static_cast<int64_t>(n) + d;
    202        if (update > INT32_MAX) {
    203            return -1;
    204        }
    205        n = static_cast<int32_t>(update);
    206        ++p;
    207    }
    208    if (p == pos) {
    209        return -1;
    210    }
    211    pos = p;
    212    return n;
    213 }
    214 
    215 U_NAMESPACE_END