tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ascii.cc (12813B)


      1 // Copyright 2017 The Abseil Authors.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //      https://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "absl/strings/ascii.h"
     16 
     17 #include <climits>
     18 #include <cstddef>
     19 #include <cstring>
     20 #include <string>
     21 
     22 #include "absl/base/config.h"
     23 #include "absl/base/nullability.h"
     24 
     25 namespace absl {
     26 ABSL_NAMESPACE_BEGIN
     27 namespace ascii_internal {
     28 
     29 // # Table generated by this Python code (bit 0x02 is currently unused):
     30 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
     31 
     32 // NOTE: The kAsciiPropertyBits table used within this code was generated by
     33 // Python code of the following form. (Bit 0x02 is currently unused and
     34 // available.)
     35 //
     36 // def Hex2(n):
     37 //   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
     38 // def IsPunct(ch):
     39 //   return (ord(ch) >= 32 and ord(ch) < 127 and
     40 //           not ch.isspace() and not ch.isalnum())
     41 // def IsBlank(ch):
     42 //   return ch in ' \t'
     43 // def IsCntrl(ch):
     44 //   return ord(ch) < 32 or ord(ch) == 127
     45 // def IsXDigit(ch):
     46 //   return ch.isdigit() or ch.lower() in 'abcdef'
     47 // for i in range(128):
     48 //   ch = chr(i)
     49 //   mask = ((ch.isalpha() and 0x01 or 0) |
     50 //           (ch.isalnum() and 0x04 or 0) |
     51 //           (ch.isspace() and 0x08 or 0) |
     52 //           (IsPunct(ch) and 0x10 or 0) |
     53 //           (IsBlank(ch) and 0x20 or 0) |
     54 //           (IsCntrl(ch) and 0x40 or 0) |
     55 //           (IsXDigit(ch) and 0x80 or 0))
     56 //   print Hex2(mask) + ',',
     57 //   if i % 16 == 7:
     58 //     print ' //', Hex2(i & 0x78)
     59 //   elif i % 16 == 15:
     60 //     print
     61 
     62 // clang-format off
     63 // Array of bitfields holding character information. Each bit value corresponds
     64 // to a particular character feature. For readability, and because the value
     65 // of these bits is tightly coupled to this implementation, the individual bits
     66 // are not named. Note that bitfields for all characters above ASCII 127 are
     67 // zero-initialized.
     68 ABSL_DLL const unsigned char kPropertyBits[256] = {
     69    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
     70    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
     71    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
     72    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
     73    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
     74    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
     75    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
     76    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
     77    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
     78    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
     79    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
     80    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
     81    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
     82    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
     83    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
     84    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
     85 };
     86 
     87 // Array of characters for the ascii_tolower() function. For values 'A'
     88 // through 'Z', return the lower-case character; otherwise, return the
     89 // identity of the passed character.
     90 ABSL_DLL const char kToLower[256] = {
     91  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
     92  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
     93  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
     94  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
     95  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
     96  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
     97  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
     98  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
     99  '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
    100     'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
    101     'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
    102     'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
    103  '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
    104  '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
    105  '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
    106  '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
    107  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
    108  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
    109  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
    110  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
    111  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
    112  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
    113  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
    114  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
    115  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
    116  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
    117  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
    118  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
    119  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
    120  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
    121  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
    122  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
    123 };
    124 
    125 // Array of characters for the ascii_toupper() function. For values 'a'
    126 // through 'z', return the upper-case character; otherwise, return the
    127 // identity of the passed character.
    128 ABSL_DLL const char kToUpper[256] = {
    129  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
    130  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
    131  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
    132  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
    133  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
    134  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
    135  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
    136  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
    137  '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
    138  '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
    139  '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
    140  '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
    141  '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
    142     'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
    143     'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
    144     'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
    145  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
    146  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
    147  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
    148  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
    149  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
    150  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
    151  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
    152  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
    153  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
    154  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
    155  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
    156  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
    157  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
    158  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
    159  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
    160  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
    161 };
    162 // clang-format on
    163 
    164 // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
    165 // Implemented by:
    166 //  1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
    167 //  2. Comparing to SCHAR_MIN + 26.
    168 template <bool ToUpper>
    169 constexpr bool AsciiInAZRange(unsigned char c) {
    170  constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
    171  constexpr signed char threshold = SCHAR_MIN + 26;  // 26 = alphabet size.
    172  // Using unsigned arithmetic as overflows/underflows are well defined.
    173  unsigned char u = c - sub;
    174  // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
    175  return static_cast<signed char>(u) < threshold;
    176 }
    177 
    178 template <bool ToUpper>
    179 constexpr bool AsciiInAZRangeNaive(unsigned char c) {
    180  constexpr unsigned char a = (ToUpper ? 'a' : 'A');
    181  constexpr unsigned char z = (ToUpper ? 'z' : 'Z');
    182  return a <= c && c <= z;
    183 }
    184 
    185 template <bool ToUpper, bool Naive>
    186 constexpr void AsciiStrCaseFoldImpl(absl::Nonnull<char*> dst,
    187                                    absl::Nullable<const char*> src,
    188                                    size_t size) {
    189  // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
    190  // When we need to flip the case, we can xor with this bit to achieve the
    191  // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
    192  // could have chosen 'z' and 'Z', or any other pair of characters as they all
    193  // have the same single bit difference.
    194  constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
    195 
    196  for (size_t i = 0; i < size; ++i) {
    197    unsigned char v = static_cast<unsigned char>(src[i]);
    198    if ABSL_INTERNAL_CONSTEXPR_SINCE_CXX17 (Naive) {
    199      v ^= AsciiInAZRangeNaive<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
    200    } else {
    201      v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
    202    }
    203    dst[i] = static_cast<char>(v);
    204  }
    205 }
    206 
    207 // Splitting to short and long strings to allow vectorization decisions
    208 // to be made separately in the long and short cases.
    209 // Using slightly different implementations so the compiler won't optimize them
    210 // into the same code (the non-naive version is needed for SIMD, so for short
    211 // strings it's not important).
    212 // `src` may be null iff `size` is zero.
    213 template <bool ToUpper>
    214 constexpr void AsciiStrCaseFold(absl::Nonnull<char*> dst,
    215                                absl::Nullable<const char*> src, size_t size) {
    216  size < 16 ? AsciiStrCaseFoldImpl<ToUpper, /*Naive=*/true>(dst, src, size)
    217            : AsciiStrCaseFoldImpl<ToUpper, /*Naive=*/false>(dst, src, size);
    218 }
    219 
    220 void AsciiStrToLower(absl::Nonnull<char*> dst, absl::Nullable<const char*> src,
    221                     size_t n) {
    222  return AsciiStrCaseFold<false>(dst, src, n);
    223 }
    224 
    225 void AsciiStrToUpper(absl::Nonnull<char*> dst, absl::Nullable<const char*> src,
    226                     size_t n) {
    227  return AsciiStrCaseFold<true>(dst, src, n);
    228 }
    229 
    230 static constexpr size_t ValidateAsciiCasefold() {
    231  constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
    232  size_t incorrect_index = 0;
    233  char lowered[num_chars] = {};
    234  char uppered[num_chars] = {};
    235  for (unsigned int i = 0; i < num_chars; ++i) {
    236    uppered[i] = lowered[i] = static_cast<char>(i);
    237  }
    238  AsciiStrCaseFold<false>(&lowered[0], &lowered[0], num_chars);
    239  AsciiStrCaseFold<true>(&uppered[0], &uppered[0], num_chars);
    240  for (size_t i = 0; i < num_chars; ++i) {
    241    const char ch = static_cast<char>(i),
    242               ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
    243               ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
    244    if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
    245      incorrect_index = i > 0 ? i : num_chars;
    246      break;
    247    }
    248  }
    249  return incorrect_index;
    250 }
    251 
    252 static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
    253 
    254 }  // namespace ascii_internal
    255 
    256 void AsciiStrToLower(absl::Nonnull<std::string*> s) {
    257  char* p = &(*s)[0];
    258  return ascii_internal::AsciiStrCaseFold<false>(p, p, s->size());
    259 }
    260 
    261 void AsciiStrToUpper(absl::Nonnull<std::string*> s) {
    262  char* p = &(*s)[0];
    263  return ascii_internal::AsciiStrCaseFold<true>(p, p, s->size());
    264 }
    265 
    266 void RemoveExtraAsciiWhitespace(absl::Nonnull<std::string*> str) {
    267  auto stripped = StripAsciiWhitespace(*str);
    268 
    269  if (stripped.empty()) {
    270    str->clear();
    271    return;
    272  }
    273 
    274  auto input_it = stripped.begin();
    275  auto input_end = stripped.end();
    276  auto output_it = &(*str)[0];
    277  bool is_ws = false;
    278 
    279  for (; input_it < input_end; ++input_it) {
    280    if (is_ws) {
    281      // Consecutive whitespace?  Keep only the last.
    282      is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
    283      if (is_ws) --output_it;
    284    } else {
    285      is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
    286    }
    287 
    288    *output_it = *input_it;
    289    ++output_it;
    290  }
    291 
    292  str->erase(static_cast<size_t>(output_it - &(*str)[0]));
    293 }
    294 
    295 ABSL_NAMESPACE_END
    296 }  // namespace absl