escaping.cc (39739B)
1 // Copyright 2017 The Abseil Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "absl/strings/escaping.h" 16 17 #include <algorithm> 18 #include <array> 19 #include <cassert> 20 #include <cstddef> 21 #include <cstdint> 22 #include <cstring> 23 #include <limits> 24 #include <string> 25 #include <utility> 26 27 #include "absl/base/config.h" 28 #include "absl/base/internal/endian.h" 29 #include "absl/base/internal/raw_logging.h" 30 #include "absl/base/internal/unaligned_access.h" 31 #include "absl/base/nullability.h" 32 #include "absl/strings/ascii.h" 33 #include "absl/strings/charset.h" 34 #include "absl/strings/internal/escaping.h" 35 #include "absl/strings/internal/resize_uninitialized.h" 36 #include "absl/strings/internal/utf8.h" 37 #include "absl/strings/numbers.h" 38 #include "absl/strings/str_cat.h" 39 #include "absl/strings/string_view.h" 40 41 namespace absl { 42 ABSL_NAMESPACE_BEGIN 43 namespace { 44 45 // These are used for the leave_nulls_escaped argument to CUnescapeInternal(). 
46 constexpr bool kUnescapeNulls = false; 47 48 inline bool is_octal_digit(char c) { return ('0' <= c) && (c <= '7'); } 49 50 inline unsigned int hex_digit_to_int(char c) { 51 static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61, 52 "Character set must be ASCII."); 53 assert(absl::ascii_isxdigit(static_cast<unsigned char>(c))); 54 unsigned int x = static_cast<unsigned char>(c); 55 if (x > '9') { 56 x += 9; 57 } 58 return x & 0xf; 59 } 60 61 inline bool IsSurrogate(char32_t c, absl::string_view src, 62 absl::Nullable<std::string*> error) { 63 if (c >= 0xD800 && c <= 0xDFFF) { 64 if (error) { 65 *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\", 66 src); 67 } 68 return true; 69 } 70 return false; 71 } 72 73 // ---------------------------------------------------------------------- 74 // CUnescapeInternal() 75 // Implements both CUnescape() and CUnescapeForNullTerminatedString(). 76 // 77 // Unescapes C escape sequences and is the reverse of CEscape(). 78 // 79 // If 'source' is valid, stores the unescaped string and its size in 80 // 'dest' and 'dest_len' respectively, and returns true. Otherwise 81 // returns false and optionally stores the error description in 82 // 'error'. Set 'error' to nullptr to disable error reporting. 83 // 84 // 'dest' should point to a buffer that is at least as big as 'source'. 85 // 'source' and 'dest' may be the same. 86 // 87 // NOTE: any changes to this function must also be reflected in the older 88 // UnescapeCEscapeSequences(). 
89 // ---------------------------------------------------------------------- 90 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped, 91 absl::Nonnull<char*> dest, 92 absl::Nonnull<ptrdiff_t*> dest_len, 93 absl::Nullable<std::string*> error) { 94 char* d = dest; 95 const char* p = source.data(); 96 const char* end = p + source.size(); 97 const char* last_byte = end - 1; 98 99 // Small optimization for case where source = dest and there's no escaping 100 while (p == d && p < end && *p != '\\') p++, d++; 101 102 while (p < end) { 103 if (*p != '\\') { 104 *d++ = *p++; 105 } else { 106 if (++p > last_byte) { // skip past the '\\' 107 if (error) *error = "String cannot end with \\"; 108 return false; 109 } 110 switch (*p) { 111 case 'a': *d++ = '\a'; break; 112 case 'b': *d++ = '\b'; break; 113 case 'f': *d++ = '\f'; break; 114 case 'n': *d++ = '\n'; break; 115 case 'r': *d++ = '\r'; break; 116 case 't': *d++ = '\t'; break; 117 case 'v': *d++ = '\v'; break; 118 case '\\': *d++ = '\\'; break; 119 case '?': *d++ = '\?'; break; // \? Who knew? 
120 case '\'': *d++ = '\''; break; 121 case '"': *d++ = '\"'; break; 122 case '0': 123 case '1': 124 case '2': 125 case '3': 126 case '4': 127 case '5': 128 case '6': 129 case '7': { 130 // octal digit: 1 to 3 digits 131 const char* octal_start = p; 132 unsigned int ch = static_cast<unsigned int>(*p - '0'); // digit 1 133 if (p < last_byte && is_octal_digit(p[1])) 134 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 2 135 if (p < last_byte && is_octal_digit(p[1])) 136 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 3 137 if (ch > 0xff) { 138 if (error) { 139 *error = "Value of \\" + 140 std::string(octal_start, 141 static_cast<size_t>(p + 1 - octal_start)) + 142 " exceeds 0xff"; 143 } 144 return false; 145 } 146 if ((ch == 0) && leave_nulls_escaped) { 147 // Copy the escape sequence for the null character 148 const size_t octal_size = static_cast<size_t>(p + 1 - octal_start); 149 *d++ = '\\'; 150 memmove(d, octal_start, octal_size); 151 d += octal_size; 152 break; 153 } 154 *d++ = static_cast<char>(ch); 155 break; 156 } 157 case 'x': 158 case 'X': { 159 if (p >= last_byte) { 160 if (error) *error = "String cannot end with \\x"; 161 return false; 162 } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) { 163 if (error) *error = "\\x cannot be followed by a non-hex digit"; 164 return false; 165 } 166 unsigned int ch = 0; 167 const char* hex_start = p; 168 while (p < last_byte && 169 absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) 170 // Arbitrarily many hex digits 171 ch = (ch << 4) + hex_digit_to_int(*++p); 172 if (ch > 0xFF) { 173 if (error) { 174 *error = "Value of \\" + 175 std::string(hex_start, 176 static_cast<size_t>(p + 1 - hex_start)) + 177 " exceeds 0xff"; 178 } 179 return false; 180 } 181 if ((ch == 0) && leave_nulls_escaped) { 182 // Copy the escape sequence for the null character 183 const size_t hex_size = static_cast<size_t>(p + 1 - hex_start); 184 *d++ = '\\'; 185 memmove(d, hex_start, hex_size); 186 d 
+= hex_size; 187 break; 188 } 189 *d++ = static_cast<char>(ch); 190 break; 191 } 192 case 'u': { 193 // \uhhhh => convert 4 hex digits to UTF-8 194 char32_t rune = 0; 195 const char* hex_start = p; 196 if (p + 4 >= end) { 197 if (error) { 198 *error = "\\u must be followed by 4 hex digits: \\" + 199 std::string(hex_start, 200 static_cast<size_t>(p + 1 - hex_start)); 201 } 202 return false; 203 } 204 for (int i = 0; i < 4; ++i) { 205 // Look one char ahead. 206 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) { 207 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p. 208 } else { 209 if (error) { 210 *error = "\\u must be followed by 4 hex digits: \\" + 211 std::string(hex_start, 212 static_cast<size_t>(p + 1 - hex_start)); 213 } 214 return false; 215 } 216 } 217 if ((rune == 0) && leave_nulls_escaped) { 218 // Copy the escape sequence for the null character 219 *d++ = '\\'; 220 memmove(d, hex_start, 5); // u0000 221 d += 5; 222 break; 223 } 224 if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) { 225 return false; 226 } 227 d += strings_internal::EncodeUTF8Char(d, rune); 228 break; 229 } 230 case 'U': { 231 // \Uhhhhhhhh => convert 8 hex digits to UTF-8 232 char32_t rune = 0; 233 const char* hex_start = p; 234 if (p + 8 >= end) { 235 if (error) { 236 *error = "\\U must be followed by 8 hex digits: \\" + 237 std::string(hex_start, 238 static_cast<size_t>(p + 1 - hex_start)); 239 } 240 return false; 241 } 242 for (int i = 0; i < 8; ++i) { 243 // Look one char ahead. 244 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) { 245 // Don't change rune until we're sure this 246 // is within the Unicode limit, but do advance p. 
247 uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p); 248 if (newrune > 0x10FFFF) { 249 if (error) { 250 *error = "Value of \\" + 251 std::string(hex_start, 252 static_cast<size_t>(p + 1 - hex_start)) + 253 " exceeds Unicode limit (0x10FFFF)"; 254 } 255 return false; 256 } else { 257 rune = newrune; 258 } 259 } else { 260 if (error) { 261 *error = "\\U must be followed by 8 hex digits: \\" + 262 std::string(hex_start, 263 static_cast<size_t>(p + 1 - hex_start)); 264 } 265 return false; 266 } 267 } 268 if ((rune == 0) && leave_nulls_escaped) { 269 // Copy the escape sequence for the null character 270 *d++ = '\\'; 271 memmove(d, hex_start, 9); // U00000000 272 d += 9; 273 break; 274 } 275 if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) { 276 return false; 277 } 278 d += strings_internal::EncodeUTF8Char(d, rune); 279 break; 280 } 281 default: { 282 if (error) *error = std::string("Unknown escape sequence: \\") + *p; 283 return false; 284 } 285 } 286 p++; // read past letter we escaped 287 } 288 } 289 *dest_len = d - dest; 290 return true; 291 } 292 293 // ---------------------------------------------------------------------- 294 // CUnescapeInternal() 295 // 296 // Same as above but uses a std::string for output. 'source' and 'dest' 297 // may be the same. 
// ----------------------------------------------------------------------
// Returns true on success, in which case '*dest' holds exactly the unescaped
// bytes. Returns false on failure; '*dest' has then already been resized and
// possibly partially overwritten, so its contents should not be relied upon.
bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
                       absl::Nonnull<std::string*> dest,
                       absl::Nullable<std::string*> error) {
  // The char* overload is given a buffer exactly as large as the input.
  strings_internal::STLStringResizeUninitialized(dest, source.size());

  ptrdiff_t dest_size;
  if (!CUnescapeInternal(source,
                         leave_nulls_escaped,
                         &(*dest)[0],
                         &dest_size,
                         error)) {
    return false;
  }
  // Trim the buffer down to the number of bytes actually produced.
  dest->erase(static_cast<size_t>(dest_size));
  return true;
}

// ----------------------------------------------------------------------
// CEscape()
// CHexEscape()
// Utf8SafeCEscape()
// Utf8SafeCHexEscape()
//    Escapes 'src' using C-style escape sequences.  This is useful for
//    preparing query flags.  The 'Hex' version uses hexadecimal rather than
//    octal sequences.  The 'Utf8Safe' version does not touch UTF-8 bytes.
//
//    Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
//
//    CEscapeInternal() is the shared implementation:
//      use_hex   - emit \xNN (two hex digits) instead of \NNN (three octal
//                  digits) for bytes that need a numeric escape.
//      utf8_safe - copy bytes >= 0x80 through unchanged instead of escaping
//                  them.
// ----------------------------------------------------------------------
std::string CEscapeInternal(absl::string_view src, bool use_hex,
                            bool utf8_safe) {
  std::string dest;
  bool last_hex_escape = false;  // true if last output char was \xNN.

  for (char c : src) {
    bool is_hex_escape = false;
    switch (c) {
      case '\n': dest.append("\\" "n"); break;
      case '\r': dest.append("\\" "r"); break;
      case '\t': dest.append("\\" "t"); break;
      case '\"': dest.append("\\" "\""); break;
      case '\'': dest.append("\\" "'"); break;
      case '\\': dest.append("\\" "\\"); break;
      default: {
        // Note that if we emit \xNN and the src character after that is a hex
        // digit then that digit must be escaped too to prevent it being
        // interpreted as part of the character code by C.
        const unsigned char uc = static_cast<unsigned char>(c);
        if ((!utf8_safe || uc < 0x80) &&
            (!absl::ascii_isprint(uc) ||
             (last_hex_escape && absl::ascii_isxdigit(uc)))) {
          if (use_hex) {
            dest.append("\\" "x");
            dest.push_back(numbers_internal::kHexChar[uc / 16]);
            dest.push_back(numbers_internal::kHexChar[uc % 16]);
            is_hex_escape = true;
          } else {
            // Octal form: the three indices below are each in [0, 7], so only
            // the digit entries of kHexChar are used.
            dest.append("\\");
            dest.push_back(numbers_internal::kHexChar[uc / 64]);
            dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
            dest.push_back(numbers_internal::kHexChar[uc % 8]);
          }
        } else {
          dest.push_back(c);
          break;
        }
      }
    }
    last_hex_escape = is_hex_escape;
  }

  return dest;
}

// kCEscapedLen[b] is the number of output characters CEscape() produces for
// input byte value b: 1 when the byte is copied as-is, 2 for the two-character
// escapes (\t, \n, \r, \", \', \\) and 4 for a backslash followed by three
// octal digits.
/* clang-format off */
constexpr std::array<unsigned char, 256> kCEscapedLen = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
/* clang-format on */

// Packs the escaped form of byte value 'c' into a uint32_t, first output
// character in the least significant byte. Unused high bytes are zero for the
// 1- and 2-character forms.
constexpr uint32_t MakeCEscapedLittleEndianUint32(size_t c) {
  size_t char_len = kCEscapedLen[c];
  if (char_len == 1) {
    return static_cast<uint32_t>(c);
  }
  if (char_len == 2) {
    switch (c) {
      case '\n':
        return '\\' | (static_cast<uint32_t>('n') << 8);
      case '\r':
        return '\\' | (static_cast<uint32_t>('r') << 8);
      case '\t':
        return '\\' | (static_cast<uint32_t>('t') << 8);
      case '\"':
        return '\\' | (static_cast<uint32_t>('\"') << 8);
      case '\'':
        return '\\' | (static_cast<uint32_t>('\'') << 8);
      case '\\':
        return '\\' | (static_cast<uint32_t>('\\') << 8);
    }
  }
  // 4-character form: backslash followed by three octal digits.
  return static_cast<uint32_t>('\\' | (('0' + (c / 64)) << 8) |
                               (('0' + ((c % 64) / 8)) << 16) |
                               (('0' + (c % 8)) << 24));
}

// Builds an array whose i-th element is MakeCEscapedLittleEndianUint32() of
// the i-th index in the given index sequence.
template <size_t... indexes>
inline constexpr std::array<uint32_t, sizeof...(indexes)>
MakeCEscapedLittleEndianUint32Array(std::index_sequence<indexes...>) {
  return {MakeCEscapedLittleEndianUint32(indexes)...};
}
// Compile-time table of the packed escaped form for every byte value 0..255.
constexpr std::array<uint32_t, 256> kCEscapedLittleEndianUint32Array =
    MakeCEscapedLittleEndianUint32Array(std::make_index_sequence<256>());

// Calculates the length of the C-style escaped version of 'src'.
// Assumes that non-printable characters are escaped using octal sequences, and
// that UTF-8 bytes are not handled specially.
// Aborts via ABSL_INTERNAL_CHECK if the result would overflow size_t.
inline size_t CEscapedLength(absl::string_view src) {
  size_t escaped_len = 0;
  // The maximum value of kCEscapedLen[x] is 4, so we can escape any string of
  // length size_t_max/4 without checking for overflow.
  size_t unchecked_limit =
      std::min<size_t>(src.size(), std::numeric_limits<size_t>::max() / 4);
  size_t i = 0;
  while (i < unchecked_limit) {
    // Common case: No need to check for overflow.
    escaped_len += kCEscapedLen[static_cast<unsigned char>(src[i++])];
  }
  while (i < src.size()) {
    // Beyond unchecked_limit we need to check for overflow before adding.
    size_t char_len = kCEscapedLen[static_cast<unsigned char>(src[i++])];
    ABSL_INTERNAL_CHECK(
        escaped_len <= std::numeric_limits<size_t>::max() - char_len,
        "escaped_len overflow");
    escaped_len += char_len;
  }
  return escaped_len;
}

// Appends the C-escaped (octal, not UTF-8-safe) form of 'src' to '*dest'.
// When nothing in 'src' needs escaping, the bytes are appended unchanged.
void CEscapeAndAppendInternal(absl::string_view src,
                              absl::Nonnull<std::string*> dest) {
  size_t escaped_len = CEscapedLength(src);
  // Escaping never shrinks a byte, so equal lengths mean no byte is escaped.
  if (escaped_len == src.size()) {
    dest->append(src.data(), src.size());
    return;
  }

  // We keep 3 slop bytes so that we can call `little_endian::Store32`
  // invariably regardless of the length of the escaped character.
  constexpr size_t slop_bytes = 3;
  size_t cur_dest_len = dest->size();
  size_t new_dest_len = cur_dest_len + escaped_len + slop_bytes;
  ABSL_INTERNAL_CHECK(new_dest_len > cur_dest_len, "std::string size overflow");
  strings_internal::AppendUninitializedTraits<std::string>::Append(
      dest, escaped_len + slop_bytes);
  char* append_ptr = &(*dest)[cur_dest_len];

  for (char c : src) {
    unsigned char uc = static_cast<unsigned char>(c);
    size_t char_len = kCEscapedLen[uc];
    uint32_t little_endian_uint32 = kCEscapedLittleEndianUint32Array[uc];
    // Always store 4 bytes but advance only by the real escaped length; the
    // surplus bytes are overwritten by the next store or land in the slop.
    little_endian::Store32(append_ptr, little_endian_uint32);
    append_ptr += char_len;
  }
  // Drop the slop bytes again.
  dest->resize(new_dest_len - slop_bytes);
}

// Reverses the mapping in Base64EscapeInternal; see that method's
// documentation for details of the mapping.
//
// Parameters:
//   src_param, szsrc - the encoded input and its length in bytes.
//   dest, szdest     - output buffer and its capacity. If 'dest' is null the
//                      input is only validated and nothing is written.
//   unbase64         - decoding table indexed by input byte; a negative entry
//                      marks a byte that is not a data character.
//   len              - on success, receives the decoded length.
//
// Returns false if the input contains an illegal character, if 'dest' is too
// small, if exactly one data character is left over at the end, or if the
// number of trailing pad characters is neither zero nor the expected count.
// ASCII whitespace is skipped, and both '=' and '.' are accepted as padding.
bool Base64UnescapeInternal(absl::Nullable<const char*> src_param, size_t szsrc,
                            absl::Nullable<char*> dest, size_t szdest,
                            const std::array<signed char, 256>& unbase64,
                            absl::Nonnull<size_t*> len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced so far
  int decode = 0;         // table value of the most recently read byte
  int state = 0;          // data characters accumulated in 'temp' (0..3)
  unsigned char ch = 0;   // most recently read input byte
  unsigned int temp = 0;  // bit accumulator, 6 bits per data character

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements.  Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
  //
  // Note that the 'break' inside the macro leaves the enclosing while loop,
  // with 'state' set to the number of data characters already folded into
  // 'temp' for the current group.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Validation-only variant of the loop above: same input handling, but
    // only the output length is tracked.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  const bool ok = (equals == 0 || equals == expected_equals);
  // '*len' is only written on success.
  if (ok) *len = destidx;
  return ok;
}

// The arrays below map base64-escaped characters back to their original values.
// For the inverse case, see k(WebSafe)Base64Chars in the internal
// escaping.cc.
// These arrays were generated by the following inversion code:
// #include <sys/time.h>
// #include <stdlib.h>
// #include <string.h>
// main()
// {
//   static const char Base64[] =
//     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
//   char* pos;
//   int idx, i, j;
//   printf("    ");
//   for (i = 0; i < 255; i += 8) {
//     for (j = i; j < i + 8; j++) {
//       pos = strchr(Base64, j);
//       if ((pos == nullptr) || (j == 0))
//         idx = -1;
//       else
//         idx = pos - Base64;
//       if (idx == -1)
//         printf(" %2d,     ", idx);
//       else
//         printf(" %2d/*%c*/,", idx, j);
//     }
//     printf("\n    ");
//   }
// }
//
// where the value of "Base64[]" was replaced by one of k(WebSafe)Base64Chars
// in the internal escaping.cc.
/* clang-format off */
// Decoding table for the standard alphabet ('+' -> 62, '/' -> 63). Entries
// are the 6-bit value of the character, or -1 for every other byte.
constexpr std::array<signed char, 256> kUnBase64 = {
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
  52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
  60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
  -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
  07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
  15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
  23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
  -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
  33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
  41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
  49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};

// Decoding table for the web-safe alphabet ('-' -> 62, '_' -> 63). Entries
// are the 6-bit value of the character, or -1 for every other byte.
constexpr std::array<signed char, 256> kUnWebSafeBase64 = {
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
  52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
  60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
  -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
  07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
  15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
  23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
  -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
  33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
  41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
  49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
  -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
/* clang-format on */

// Decodes the 'slen' bytes at 'src' into '*dest' using the decoding table
// 'unbase64'. On failure '*dest' is cleared and false is returned; on success
// '*dest' holds exactly the decoded bytes.
template <typename String>
bool Base64UnescapeInternal(absl::Nullable<const char*> src, size_t slen,
                            absl::Nonnull<String*> dest,
                            const std::array<signed char, 256>& unbase64) {
  // Determine the size of the output string.  Base64 encodes every 3 bytes into
  // 4 characters.  Any leftover chars are added directly for good measure.
  const size_t dest_len = 3 * (slen / 4) + (slen % 4);

  strings_internal::STLStringResizeUninitialized(dest, dest_len);

  // We are getting the destination buffer by getting the beginning of the
  // string and converting it into a char *.
  size_t len;
  const bool ok =
      Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
  if (!ok) {
    dest->clear();
    return false;
  }

  // could be shorter if there was padding
  assert(len <= dest_len);
  dest->erase(len);

  return true;
}

/* clang-format off */
// Hex digit values for lenient decoding: every byte that is not a hex digit
// maps to 0, so invalid input is not distinguishable from the digit '0'.
constexpr std::array<char, 256> kHexValueLenient = {
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0,  // '0'..'9'
    0, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 'A'..'F'
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 'a'..'f'
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
};

// Hex digit values for strict decoding: every byte that is not a hex digit
// maps to -1 so that callers can detect invalid input.
constexpr std::array<signed char, 256> kHexValueStrict = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,  // '0'..'9'
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 'A'..'F'
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 'a'..'f'
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
/* clang-format on */

// This is a templated function so that T can be either a char*
// or a string.  This works because we use the [] operator to access
// individual characters at a time.
//
// Writes 'num' bytes to 'to', reading 2 * 'num' characters from 'from'. Each
// pair of characters becomes one byte, first character in the high nibble.
// Lenient: characters that are not hex digits decode as 0.
template <typename T>
void HexStringToBytesInternal(absl::Nullable<const char*> from, T to,
                              size_t num) {
  for (size_t i = 0; i < num; i++) {
    // '& 0xFF' keeps the table index in [0, 255] even when char is signed.
    to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
            (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
  }
}

// This is a templated function so that T can be either a char* or a
// std::string.
//
// Writes 2 * 'num' characters to 'dest': for each of the 'num' bytes at 'src',
// the two characters stored for that byte value in
// numbers_internal::kHexTable.
template <typename T>
void BytesToHexStringInternal(absl::Nullable<const unsigned char*> src, T dest,
                              size_t num) {
  auto dest_ptr = &dest[0];
  for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
    // kHexTable stores two characters per byte value, hence the '* 2'.
    const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
    std::copy(hex_p, hex_p + 2, dest_ptr);
  }
}

}  // namespace

// ----------------------------------------------------------------------
// CUnescape()
//
// See CUnescapeInternal() for implementation details.
917 // ---------------------------------------------------------------------- 918 bool CUnescape(absl::string_view source, absl::Nonnull<std::string*> dest, 919 absl::Nullable<std::string*> error) { 920 return CUnescapeInternal(source, kUnescapeNulls, dest, error); 921 } 922 923 std::string CEscape(absl::string_view src) { 924 std::string dest; 925 CEscapeAndAppendInternal(src, &dest); 926 return dest; 927 } 928 929 std::string CHexEscape(absl::string_view src) { 930 return CEscapeInternal(src, true, false); 931 } 932 933 std::string Utf8SafeCEscape(absl::string_view src) { 934 return CEscapeInternal(src, false, true); 935 } 936 937 std::string Utf8SafeCHexEscape(absl::string_view src) { 938 return CEscapeInternal(src, true, true); 939 } 940 941 bool Base64Unescape(absl::string_view src, absl::Nonnull<std::string*> dest) { 942 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64); 943 } 944 945 bool WebSafeBase64Unescape(absl::string_view src, 946 absl::Nonnull<std::string*> dest) { 947 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64); 948 } 949 950 void Base64Escape(absl::string_view src, absl::Nonnull<std::string*> dest) { 951 strings_internal::Base64EscapeInternal( 952 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest, 953 true, strings_internal::kBase64Chars); 954 } 955 956 void WebSafeBase64Escape(absl::string_view src, 957 absl::Nonnull<std::string*> dest) { 958 strings_internal::Base64EscapeInternal( 959 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest, 960 false, strings_internal::kWebSafeBase64Chars); 961 } 962 963 std::string Base64Escape(absl::string_view src) { 964 std::string dest; 965 strings_internal::Base64EscapeInternal( 966 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest, 967 true, strings_internal::kBase64Chars); 968 return dest; 969 } 970 971 std::string WebSafeBase64Escape(absl::string_view src) { 972 std::string dest; 973 
strings_internal::Base64EscapeInternal( 974 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest, 975 false, strings_internal::kWebSafeBase64Chars); 976 return dest; 977 } 978 979 bool HexStringToBytes(absl::string_view hex, 980 absl::Nonnull<std::string*> bytes) { 981 std::string output; 982 983 size_t num_bytes = hex.size() / 2; 984 if (hex.size() != num_bytes * 2) { 985 return false; 986 } 987 988 absl::strings_internal::STLStringResizeUninitialized(&output, num_bytes); 989 auto hex_p = hex.cbegin(); 990 for (std::string::iterator bin_p = output.begin(); bin_p != output.end(); 991 ++bin_p) { 992 int h1 = absl::kHexValueStrict[static_cast<size_t>(*hex_p++)]; 993 int h2 = absl::kHexValueStrict[static_cast<size_t>(*hex_p++)]; 994 if (h1 == -1 || h2 == -1) { 995 output.resize(static_cast<size_t>(bin_p - output.begin())); 996 return false; 997 } 998 *bin_p = static_cast<char>((h1 << 4) + h2); 999 } 1000 1001 *bytes = std::move(output); 1002 return true; 1003 } 1004 1005 std::string HexStringToBytes(absl::string_view from) { 1006 std::string result; 1007 const auto num = from.size() / 2; 1008 strings_internal::STLStringResizeUninitialized(&result, num); 1009 absl::HexStringToBytesInternal<std::string&>(from.data(), result, num); 1010 return result; 1011 } 1012 1013 std::string BytesToHexString(absl::string_view from) { 1014 std::string result; 1015 strings_internal::STLStringResizeUninitialized(&result, 2 * from.size()); 1016 absl::BytesToHexStringInternal<std::string&>( 1017 reinterpret_cast<const unsigned char*>(from.data()), result, from.size()); 1018 return result; 1019 } 1020 1021 ABSL_NAMESPACE_END 1022 } // namespace absl