util_string.c (13591B)
1 /* Copyright (c) 2003-2004, Roger Dingledine 2 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson. 3 * Copyright (c) 2007-2021, The Tor Project, Inc. */ 4 /* See LICENSE for licensing information */ 5 6 /** 7 * \file util_string.c 8 * \brief Non-standard string functions used throughout Tor. 9 **/ 10 11 #include "lib/string/util_string.h" 12 #include "lib/string/compat_ctype.h" 13 #include "lib/err/torerr.h" 14 #include "lib/ctime/di_ops.h" 15 #include "lib/defs/digest_sizes.h" 16 17 #include <string.h> 18 #include <stdlib.h> 19 20 /** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at 21 * <b>needle</b>, return a pointer to the first occurrence of the needle 22 * within the haystack, or NULL if there is no such occurrence. 23 * 24 * This function is <em>not</em> timing-safe. 25 * 26 * Requires that <b>nlen</b> be greater than zero. 27 */ 28 const void * 29 tor_memmem(const void *_haystack, size_t hlen, 30 const void *_needle, size_t nlen) 31 { 32 #if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) 33 raw_assert(nlen); 34 if (nlen > hlen) 35 return NULL; 36 return memmem(_haystack, hlen, _needle, nlen); 37 #else 38 /* This isn't as fast as the GLIBC implementation, but it doesn't need to 39 * be. */ 40 const char *p, *last_possible_start; 41 const char *haystack = (const char*)_haystack; 42 const char *needle = (const char*)_needle; 43 char first; 44 raw_assert(nlen); 45 46 if (nlen > hlen) 47 return NULL; 48 49 p = haystack; 50 /* Last position at which the needle could start. */ 51 last_possible_start = haystack + hlen - nlen; 52 first = *(const char*)needle; 53 while ((p = memchr(p, first, last_possible_start + 1 - p))) { 54 if (fast_memeq(p, needle, nlen)) 55 return p; 56 if (++p > last_possible_start) { 57 /* This comparison shouldn't be necessary, since if p was previously 58 * equal to last_possible_start, the next memchr call would be 59 * "memchr(p, first, 0)", which will return NULL. But it clarifies the 60 * logic. */ 61 return NULL; 62 } 63 } 64 return NULL; 65 #endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */ 66 } 67 68 const void * 69 tor_memstr(const void *haystack, size_t hlen, const char *needle) 70 { 71 return tor_memmem(haystack, hlen, needle, strlen(needle)); 72 } 73 74 /** Return true iff the 'len' bytes at 'mem' are all zero. */ 75 int 76 fast_mem_is_zero(const char *mem, size_t len) 77 { 78 static const char ZERO[] = { 79 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 80 }; 81 while (len >= sizeof(ZERO)) { 82 /* It's safe to use fast_memcmp here, since the very worst thing an 83 * attacker could learn is how many initial bytes of a secret were zero */ 84 if (fast_memcmp(mem, ZERO, sizeof(ZERO))) 85 return 0; 86 len -= sizeof(ZERO); 87 mem += sizeof(ZERO); 88 } 89 /* Deal with leftover bytes. */ 90 if (len) 91 return fast_memeq(mem, ZERO, len); 92 93 return 1; 94 } 95 96 /** Return true iff the DIGEST_LEN bytes in digest are all zero. */ 97 int 98 tor_digest_is_zero(const char *digest) 99 { 100 return safe_mem_is_zero(digest, DIGEST_LEN); 101 } 102 103 /** Return true iff the DIGEST256_LEN bytes in digest are all zero. */ 104 int 105 tor_digest256_is_zero(const char *digest) 106 { 107 return safe_mem_is_zero(digest, DIGEST256_LEN); 108 } 109 110 /** Remove from the string <b>s</b> every character which appears in 111 * <b>strip</b>. */ 112 void 113 tor_strstrip(char *s, const char *strip) 114 { 115 char *readp = s; 116 while (*readp) { 117 if (strchr(strip, *readp)) { 118 ++readp; 119 } else { 120 *s++ = *readp++; 121 } 122 } 123 *s = '\0'; 124 } 125 126 /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to 127 * lowercase. */ 128 void 129 tor_strlower(char *s) 130 { 131 while (*s) { 132 *s = TOR_TOLOWER(*s); 133 ++s; 134 } 135 } 136 137 /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to 138 * lowercase. */ 139 void 140 tor_strupper(char *s) 141 { 142 while (*s) { 143 *s = TOR_TOUPPER(*s); 144 ++s; 145 } 146 } 147 148 /** Replaces <b>old</b> with <b>replacement</b> in <b>s</b> */ 149 void 150 tor_strreplacechar(char *s, char find, char replacement) 151 { 152 for (s = strchr(s, find); s; s = strchr(s + 1, find)) { 153 *s = replacement; 154 } 155 } 156 157 /** Return 1 if every character in <b>s</b> is printable, else return 0. 158 */ 159 int 160 tor_strisprint(const char *s) 161 { 162 while (*s) { 163 if (!TOR_ISPRINT(*s)) 164 return 0; 165 s++; 166 } 167 return 1; 168 } 169 170 /** Return 1 if no character in <b>s</b> is uppercase, else return 0. 171 */ 172 int 173 tor_strisnonupper(const char *s) 174 { 175 while (*s) { 176 if (TOR_ISUPPER(*s)) 177 return 0; 178 s++; 179 } 180 return 1; 181 } 182 183 /** Return true iff every character in <b>s</b> is whitespace space; else 184 * return false. */ 185 int 186 tor_strisspace(const char *s) 187 { 188 while (*s) { 189 if (!TOR_ISSPACE(*s)) 190 return 0; 191 s++; 192 } 193 return 1; 194 } 195 196 /** As strcmp, except that either string may be NULL. The NULL string is 197 * considered to be before any non-NULL string. */ 198 int 199 strcmp_opt(const char *s1, const char *s2) 200 { 201 if (!s1) { 202 if (!s2) 203 return 0; 204 else 205 return -1; 206 } else if (!s2) { 207 return 1; 208 } else { 209 return strcmp(s1, s2); 210 } 211 } 212 213 /** Compares the first strlen(s2) characters of s1 with s2. Returns as for 214 * strcmp. 215 */ 216 int 217 strcmpstart(const char *s1, const char *s2) 218 { 219 size_t n = strlen(s2); 220 return strncmp(s1, s2, n); 221 } 222 223 /** Compares the first strlen(s2) characters of s1 with s2. Returns as for 224 * strcasecmp. 225 */ 226 int 227 strcasecmpstart(const char *s1, const char *s2) 228 { 229 size_t n = strlen(s2); 230 return strncasecmp(s1, s2, n); 231 } 232 233 /** Compare the value of the string <b>prefix</b> with the start of the 234 * <b>memlen</b>-byte memory chunk at <b>mem</b>. Return as for strcmp. 235 * 236 * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is 237 * less than strlen(prefix).] 238 */ 239 int 240 fast_memcmpstart(const void *mem, size_t memlen, 241 const char *prefix) 242 { 243 size_t plen = strlen(prefix); 244 if (memlen < plen) 245 return -1; 246 return fast_memcmp(mem, prefix, plen); 247 } 248 249 /** Compares the last strlen(s2) characters of s1 with s2. Returns as for 250 * strcmp. 251 */ 252 int 253 strcmpend(const char *s1, const char *s2) 254 { 255 size_t n1 = strlen(s1), n2 = strlen(s2); 256 if (n2>n1) 257 return strcmp(s1,s2); 258 else 259 return strncmp(s1+(n1-n2), s2, n2); 260 } 261 262 /** Compares the last strlen(s2) characters of s1 with s2. Returns as for 263 * strcasecmp. 264 */ 265 int 266 strcasecmpend(const char *s1, const char *s2) 267 { 268 size_t n1 = strlen(s1), n2 = strlen(s2); 269 if (n2>n1) /* then they can't be the same; figure out which is bigger */ 270 return strcasecmp(s1,s2); 271 else 272 return strncasecmp(s1+(n1-n2), s2, n2); 273 } 274 275 /** Return a pointer to the first char of s that is not whitespace and 276 * not a comment, or to the terminating NUL if no such character exists. 277 */ 278 const char * 279 eat_whitespace(const char *s) 280 { 281 raw_assert(s); 282 283 while (1) { 284 switch (*s) { 285 case '\0': 286 default: 287 return s; 288 case ' ': 289 case '\t': 290 case '\n': 291 case '\r': 292 ++s; 293 break; 294 case '#': 295 ++s; 296 while (*s && *s != '\n') 297 ++s; 298 } 299 } 300 } 301 302 /** Return a pointer to the first char of s that is not whitespace and 303 * not a comment, or to the terminating NUL if no such character exists. 304 */ 305 const char * 306 eat_whitespace_eos(const char *s, const char *eos) 307 { 308 raw_assert(s); 309 raw_assert(eos && s <= eos); 310 311 while (s < eos) { 312 switch (*s) { 313 case '\0': 314 default: 315 return s; 316 case ' ': 317 case '\t': 318 case '\n': 319 case '\r': 320 ++s; 321 break; 322 case '#': 323 ++s; 324 while (s < eos && *s && *s != '\n') 325 ++s; 326 } 327 } 328 return s; 329 } 330 331 /** Return a pointer to the first char of s that is not a space or a tab 332 * or a \\r, or to the terminating NUL if no such character exists. */ 333 const char * 334 eat_whitespace_no_nl(const char *s) 335 { 336 while (*s == ' ' || *s == '\t' || *s == '\r') 337 ++s; 338 return s; 339 } 340 341 /** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have 342 * found a non-whitespace character or not. */ 343 const char * 344 eat_whitespace_eos_no_nl(const char *s, const char *eos) 345 { 346 while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r')) 347 ++s; 348 return s; 349 } 350 351 /** Return a pointer to the first char of s that is whitespace or <b>#</b>, 352 * or to the terminating NUL if no such character exists. 353 */ 354 const char * 355 find_whitespace(const char *s) 356 { 357 /* tor_assert(s); */ 358 while (1) { 359 switch (*s) 360 { 361 case '\0': 362 case '#': 363 case ' ': 364 case '\r': 365 case '\n': 366 case '\t': 367 return s; 368 default: 369 ++s; 370 } 371 } 372 } 373 374 /** As find_whitespace, but stop at <b>eos</b> whether we have found a 375 * whitespace or not. */ 376 const char * 377 find_whitespace_eos(const char *s, const char *eos) 378 { 379 /* tor_assert(s); */ 380 while (s < eos) { 381 switch (*s) 382 { 383 case '\0': 384 case '#': 385 case ' ': 386 case '\r': 387 case '\n': 388 case '\t': 389 return s; 390 default: 391 ++s; 392 } 393 } 394 return s; 395 } 396 397 /** Return the first occurrence of <b>needle</b> in <b>haystack</b> that 398 * occurs at the start of a line (that is, at the beginning of <b>haystack</b> 399 * or immediately after a newline). Return NULL if no such string is found. 400 */ 401 const char * 402 find_str_at_start_of_line(const char *haystack, const char *needle) 403 { 404 size_t needle_len = strlen(needle); 405 406 do { 407 if (!strncmp(haystack, needle, needle_len)) 408 return haystack; 409 410 haystack = strchr(haystack, '\n'); 411 if (!haystack) 412 return NULL; 413 else 414 ++haystack; 415 } while (*haystack); 416 417 return NULL; 418 } 419 420 /** Returns true if <b>string</b> could be a C identifier. 421 A C identifier must begin with a letter or an underscore and the 422 rest of its characters can be letters, numbers or underscores. No 423 length limit is imposed. */ 424 int 425 string_is_C_identifier(const char *string) 426 { 427 size_t iter; 428 size_t length = strlen(string); 429 if (!length) 430 return 0; 431 432 for (iter = 0; iter < length ; iter++) { 433 if (iter == 0) { 434 if (!(TOR_ISALPHA(string[iter]) || 435 string[iter] == '_')) 436 return 0; 437 } else { 438 if (!(TOR_ISALPHA(string[iter]) || 439 TOR_ISDIGIT(string[iter]) || 440 string[iter] == '_')) 441 return 0; 442 } 443 } 444 445 return 1; 446 } 447 448 /** A byte with the top <b>x</b> bits set. */ 449 #define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x)))) 450 /** A byte with the lowest <b>x</b> bits set. */ 451 #define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x)))) 452 453 /** Given the leading byte <b>b</b>, return the total number of bytes in the 454 * UTF-8 character. Returns 0 if it's an invalid leading byte. 455 */ 456 static uint8_t 457 bytes_in_char(uint8_t b) 458 { 459 if ((TOP_BITS(1) & b) == 0x00) 460 return 1; // a 1-byte UTF-8 char, aka ASCII 461 if ((TOP_BITS(3) & b) == TOP_BITS(2)) 462 return 2; // a 2-byte UTF-8 char 463 if ((TOP_BITS(4) & b) == TOP_BITS(3)) 464 return 3; // a 3-byte UTF-8 char 465 if ((TOP_BITS(5) & b) == TOP_BITS(4)) 466 return 4; // a 4-byte UTF-8 char 467 468 // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111. 469 return 0; 470 } 471 472 /** Returns true iff <b>b</b> is a UTF-8 continuation byte. */ 473 static bool 474 is_continuation_byte(uint8_t b) 475 { 476 uint8_t top2bits = b & TOP_BITS(2); 477 return top2bits == TOP_BITS(1); 478 } 479 480 /** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8 481 * character. 482 */ 483 static bool 484 validate_char(const uint8_t *c, uint8_t len) 485 { 486 if (len == 1) 487 return true; // already validated this is an ASCII char earlier. 488 489 uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte. 490 uint32_t codepoint = c[0] & mask; 491 492 mask = LOW_BITS(6); // bitmask for continuation bytes. 493 for (uint8_t i = 1; i < len; i++) { 494 if (!is_continuation_byte(c[i])) 495 return false; 496 codepoint <<= 6; 497 codepoint |= (c[i] & mask); 498 } 499 500 if (len == 2 && codepoint <= 0x7f) 501 return false; // Invalid, overly long encoding, should have fit in 1 byte. 502 503 if (len == 3 && codepoint <= 0x7ff) 504 return false; // Invalid, overly long encoding, should have fit in 2 bytes. 505 506 if (len == 4 && codepoint <= 0xffff) 507 return false; // Invalid, overly long encoding, should have fit in 3 bytes. 508 509 if (codepoint >= 0xd800 && codepoint <= 0xdfff) 510 return false; // Invalid, reserved for UTF-16 surrogate pairs. 511 512 return codepoint <= 0x10ffff; // Check if within maximum. 513 } 514 515 /** Returns true iff the first <b>len</b> bytes in <b>str</b> are a 516 valid UTF-8 string. */ 517 int 518 string_is_utf8(const char *str, size_t len) 519 { 520 // If str is NULL, don't try to read it 521 if (!str) { 522 // We could test for this case, but the low-level logs would produce 523 // confusing test output. 524 // LCOV_EXCL_START 525 if (len) { 526 // Use the low-level logging function, so that the log module can 527 // validate UTF-8 (if needed in future code) 528 tor_log_err_sigsafe( 529 "BUG: string_is_utf8() called with NULL str but non-zero len."); 530 // Since it's a bug, we should probably reject this string 531 return false; 532 } 533 // LCOV_EXCL_STOP 534 return true; 535 } 536 537 for (size_t i = 0; i < len;) { 538 uint8_t num_bytes = bytes_in_char(str[i]); 539 if (num_bytes == 0) // Invalid leading byte found. 540 return false; 541 542 size_t next_char = i + num_bytes; 543 if (next_char > len) 544 return false; 545 546 // Validate the continuation bytes in this multi-byte character, 547 // and advance to the next character in the string. 548 if (!validate_char((const uint8_t*)&str[i], num_bytes)) 549 return false; 550 i = next_char; 551 } 552 return true; 553 } 554 555 /** As string_is_utf8(), but returns false if the string begins with a UTF-8 556 * byte order mark (BOM). 557 */ 558 int 559 string_is_utf8_no_bom(const char *str, size_t len) 560 { 561 if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") || 562 !strcmpstart(str, "\uFFFE"))) { 563 return false; 564 } 565 return string_is_utf8(str, len); 566 }