tor

The Tor anonymity network
git clone https://git.dasho.dev/tor.git
Log | Files | Refs | README | LICENSE

util_string.c (13591B)


      1 /* Copyright (c) 2003-2004, Roger Dingledine
      2 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
      3 * Copyright (c) 2007-2021, The Tor Project, Inc. */
      4 /* See LICENSE for licensing information */
      5 
      6 /**
      7 * \file util_string.c
      8 * \brief Non-standard string functions used throughout Tor.
      9 **/
     10 
     11 #include "lib/string/util_string.h"
     12 #include "lib/string/compat_ctype.h"
     13 #include "lib/err/torerr.h"
     14 #include "lib/ctime/di_ops.h"
     15 #include "lib/defs/digest_sizes.h"
     16 
     17 #include <string.h>
     18 #include <stdlib.h>
     19 
     20 /** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at
     21 * <b>needle</b>, return a pointer to the first occurrence of the needle
     22 * within the haystack, or NULL if there is no such occurrence.
     23 *
     24 * This function is <em>not</em> timing-safe.
     25 *
     26 * Requires that <b>nlen</b> be greater than zero.
     27 */
     28 const void *
     29 tor_memmem(const void *_haystack, size_t hlen,
     30           const void *_needle, size_t nlen)
     31 {
     32 #if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2)
     33  raw_assert(nlen);
     34  if (nlen > hlen)
     35    return NULL;
     36  return memmem(_haystack, hlen, _needle, nlen);
     37 #else
     38  /* This isn't as fast as the GLIBC implementation, but it doesn't need to
     39   * be. */
     40  const char *p, *last_possible_start;
     41  const char *haystack = (const char*)_haystack;
     42  const char *needle = (const char*)_needle;
     43  char first;
     44  raw_assert(nlen);
     45 
     46  if (nlen > hlen)
     47    return NULL;
     48 
     49  p = haystack;
     50  /* Last position at which the needle could start. */
     51  last_possible_start = haystack + hlen - nlen;
     52  first = *(const char*)needle;
     53  while ((p = memchr(p, first, last_possible_start + 1 - p))) {
     54    if (fast_memeq(p, needle, nlen))
     55      return p;
     56    if (++p > last_possible_start) {
     57      /* This comparison shouldn't be necessary, since if p was previously
     58       * equal to last_possible_start, the next memchr call would be
     59       * "memchr(p, first, 0)", which will return NULL. But it clarifies the
     60       * logic. */
     61      return NULL;
     62    }
     63  }
     64  return NULL;
     65 #endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */
     66 }
     67 
     68 const void *
     69 tor_memstr(const void *haystack, size_t hlen, const char *needle)
     70 {
     71  return tor_memmem(haystack, hlen, needle, strlen(needle));
     72 }
     73 
     74 /** Return true iff the 'len' bytes at 'mem' are all zero. */
     75 int
     76 fast_mem_is_zero(const char *mem, size_t len)
     77 {
     78  static const char ZERO[] = {
     79    0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
     80  };
     81  while (len >= sizeof(ZERO)) {
     82    /* It's safe to use fast_memcmp here, since the very worst thing an
     83     * attacker could learn is how many initial bytes of a secret were zero */
     84    if (fast_memcmp(mem, ZERO, sizeof(ZERO)))
     85      return 0;
     86    len -= sizeof(ZERO);
     87    mem += sizeof(ZERO);
     88  }
     89  /* Deal with leftover bytes. */
     90  if (len)
     91    return fast_memeq(mem, ZERO, len);
     92 
     93  return 1;
     94 }
     95 
     96 /** Return true iff the DIGEST_LEN bytes in digest are all zero. */
     97 int
     98 tor_digest_is_zero(const char *digest)
     99 {
    100  return safe_mem_is_zero(digest, DIGEST_LEN);
    101 }
    102 
    103 /** Return true iff the DIGEST256_LEN bytes in digest are all zero. */
    104 int
    105 tor_digest256_is_zero(const char *digest)
    106 {
    107  return safe_mem_is_zero(digest, DIGEST256_LEN);
    108 }
    109 
    110 /** Remove from the string <b>s</b> every character which appears in
    111 * <b>strip</b>. */
    112 void
    113 tor_strstrip(char *s, const char *strip)
    114 {
    115  char *readp = s;
    116  while (*readp) {
    117    if (strchr(strip, *readp)) {
    118      ++readp;
    119    } else {
    120      *s++ = *readp++;
    121    }
    122  }
    123  *s = '\0';
    124 }
    125 
    126 /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
    127 * lowercase. */
    128 void
    129 tor_strlower(char *s)
    130 {
    131  while (*s) {
    132    *s = TOR_TOLOWER(*s);
    133    ++s;
    134  }
    135 }
    136 
    137 /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
    138 * lowercase. */
    139 void
    140 tor_strupper(char *s)
    141 {
    142  while (*s) {
    143    *s = TOR_TOUPPER(*s);
    144    ++s;
    145  }
    146 }
    147 
    148 /** Replaces <b>old</b> with <b>replacement</b> in <b>s</b> */
    149 void
    150 tor_strreplacechar(char *s, char find, char replacement)
    151 {
    152  for (s = strchr(s, find); s; s = strchr(s + 1, find)) {
    153    *s = replacement;
    154  }
    155 }
    156 
    157 /** Return 1 if every character in <b>s</b> is printable, else return 0.
    158 */
    159 int
    160 tor_strisprint(const char *s)
    161 {
    162  while (*s) {
    163    if (!TOR_ISPRINT(*s))
    164      return 0;
    165    s++;
    166  }
    167  return 1;
    168 }
    169 
    170 /** Return 1 if no character in <b>s</b> is uppercase, else return 0.
    171 */
    172 int
    173 tor_strisnonupper(const char *s)
    174 {
    175  while (*s) {
    176    if (TOR_ISUPPER(*s))
    177      return 0;
    178    s++;
    179  }
    180  return 1;
    181 }
    182 
    183 /** Return true iff every character in <b>s</b> is whitespace space; else
    184 * return false. */
    185 int
    186 tor_strisspace(const char *s)
    187 {
    188  while (*s) {
    189    if (!TOR_ISSPACE(*s))
    190      return 0;
    191    s++;
    192  }
    193  return 1;
    194 }
    195 
    196 /** As strcmp, except that either string may be NULL.  The NULL string is
    197 * considered to be before any non-NULL string. */
    198 int
    199 strcmp_opt(const char *s1, const char *s2)
    200 {
    201  if (!s1) {
    202    if (!s2)
    203      return 0;
    204    else
    205      return -1;
    206  } else if (!s2) {
    207    return 1;
    208  } else {
    209    return strcmp(s1, s2);
    210  }
    211 }
    212 
    213 /** Compares the first strlen(s2) characters of s1 with s2.  Returns as for
    214 * strcmp.
    215 */
    216 int
    217 strcmpstart(const char *s1, const char *s2)
    218 {
    219  size_t n = strlen(s2);
    220  return strncmp(s1, s2, n);
    221 }
    222 
    223 /** Compares the first strlen(s2) characters of s1 with s2.  Returns as for
    224 * strcasecmp.
    225 */
    226 int
    227 strcasecmpstart(const char *s1, const char *s2)
    228 {
    229  size_t n = strlen(s2);
    230  return strncasecmp(s1, s2, n);
    231 }
    232 
    233 /** Compare the value of the string <b>prefix</b> with the start of the
    234 * <b>memlen</b>-byte memory chunk at <b>mem</b>.  Return as for strcmp.
    235 *
    236 * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is
    237 * less than strlen(prefix).]
    238 */
    239 int
    240 fast_memcmpstart(const void *mem, size_t memlen,
    241                const char *prefix)
    242 {
    243  size_t plen = strlen(prefix);
    244  if (memlen < plen)
    245    return -1;
    246  return fast_memcmp(mem, prefix, plen);
    247 }
    248 
    249 /** Compares the last strlen(s2) characters of s1 with s2.  Returns as for
    250 * strcmp.
    251 */
    252 int
    253 strcmpend(const char *s1, const char *s2)
    254 {
    255  size_t n1 = strlen(s1), n2 = strlen(s2);
    256  if (n2>n1)
    257    return strcmp(s1,s2);
    258  else
    259    return strncmp(s1+(n1-n2), s2, n2);
    260 }
    261 
    262 /** Compares the last strlen(s2) characters of s1 with s2.  Returns as for
    263 * strcasecmp.
    264 */
    265 int
    266 strcasecmpend(const char *s1, const char *s2)
    267 {
    268  size_t n1 = strlen(s1), n2 = strlen(s2);
    269  if (n2>n1) /* then they can't be the same; figure out which is bigger */
    270    return strcasecmp(s1,s2);
    271  else
    272    return strncasecmp(s1+(n1-n2), s2, n2);
    273 }
    274 
    275 /** Return a pointer to the first char of s that is not whitespace and
    276 * not a comment, or to the terminating NUL if no such character exists.
    277 */
    278 const char *
    279 eat_whitespace(const char *s)
    280 {
    281  raw_assert(s);
    282 
    283  while (1) {
    284    switch (*s) {
    285    case '\0':
    286    default:
    287      return s;
    288    case ' ':
    289    case '\t':
    290    case '\n':
    291    case '\r':
    292      ++s;
    293      break;
    294    case '#':
    295      ++s;
    296      while (*s && *s != '\n')
    297        ++s;
    298    }
    299  }
    300 }
    301 
    302 /** Return a pointer to the first char of s that is not whitespace and
    303 * not a comment, or to the terminating NUL if no such character exists.
    304 */
    305 const char *
    306 eat_whitespace_eos(const char *s, const char *eos)
    307 {
    308  raw_assert(s);
    309  raw_assert(eos && s <= eos);
    310 
    311  while (s < eos) {
    312    switch (*s) {
    313    case '\0':
    314    default:
    315      return s;
    316    case ' ':
    317    case '\t':
    318    case '\n':
    319    case '\r':
    320      ++s;
    321      break;
    322    case '#':
    323      ++s;
    324      while (s < eos && *s && *s != '\n')
    325        ++s;
    326    }
    327  }
    328  return s;
    329 }
    330 
    331 /** Return a pointer to the first char of s that is not a space or a tab
    332 * or a \\r, or to the terminating NUL if no such character exists. */
    333 const char *
    334 eat_whitespace_no_nl(const char *s)
    335 {
    336  while (*s == ' ' || *s == '\t' || *s == '\r')
    337    ++s;
    338  return s;
    339 }
    340 
    341 /** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have
    342 * found a non-whitespace character or not. */
    343 const char *
    344 eat_whitespace_eos_no_nl(const char *s, const char *eos)
    345 {
    346  while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r'))
    347    ++s;
    348  return s;
    349 }
    350 
    351 /** Return a pointer to the first char of s that is whitespace or <b>#</b>,
    352 * or to the terminating NUL if no such character exists.
    353 */
    354 const char *
    355 find_whitespace(const char *s)
    356 {
    357  /* tor_assert(s); */
    358  while (1) {
    359    switch (*s)
    360    {
    361    case '\0':
    362    case '#':
    363    case ' ':
    364    case '\r':
    365    case '\n':
    366    case '\t':
    367      return s;
    368    default:
    369      ++s;
    370    }
    371  }
    372 }
    373 
    374 /** As find_whitespace, but stop at <b>eos</b> whether we have found a
    375 * whitespace or not. */
    376 const char *
    377 find_whitespace_eos(const char *s, const char *eos)
    378 {
    379  /* tor_assert(s); */
    380  while (s < eos) {
    381    switch (*s)
    382    {
    383    case '\0':
    384    case '#':
    385    case ' ':
    386    case '\r':
    387    case '\n':
    388    case '\t':
    389      return s;
    390    default:
    391      ++s;
    392    }
    393  }
    394  return s;
    395 }
    396 
    397 /** Return the first occurrence of <b>needle</b> in <b>haystack</b> that
    398 * occurs at the start of a line (that is, at the beginning of <b>haystack</b>
    399 * or immediately after a newline).  Return NULL if no such string is found.
    400 */
    401 const char *
    402 find_str_at_start_of_line(const char *haystack, const char *needle)
    403 {
    404  size_t needle_len = strlen(needle);
    405 
    406  do {
    407    if (!strncmp(haystack, needle, needle_len))
    408      return haystack;
    409 
    410    haystack = strchr(haystack, '\n');
    411    if (!haystack)
    412      return NULL;
    413    else
    414      ++haystack;
    415  } while (*haystack);
    416 
    417  return NULL;
    418 }
    419 
    420 /** Returns true if <b>string</b> could be a C identifier.
    421    A C identifier must begin with a letter or an underscore and the
    422    rest of its characters can be letters, numbers or underscores. No
    423    length limit is imposed. */
    424 int
    425 string_is_C_identifier(const char *string)
    426 {
    427  size_t iter;
    428  size_t length = strlen(string);
    429  if (!length)
    430    return 0;
    431 
    432  for (iter = 0; iter < length ; iter++) {
    433    if (iter == 0) {
    434      if (!(TOR_ISALPHA(string[iter]) ||
    435            string[iter] == '_'))
    436        return 0;
    437    } else {
    438      if (!(TOR_ISALPHA(string[iter]) ||
    439            TOR_ISDIGIT(string[iter]) ||
    440            string[iter] == '_'))
    441        return 0;
    442    }
    443  }
    444 
    445  return 1;
    446 }
    447 
    448 /** A byte with the top <b>x</b> bits set. */
    449 #define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
    450 /** A byte with the lowest <b>x</b> bits set. */
    451 #define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
    452 
    453 /** Given the leading byte <b>b</b>, return the total number of bytes in the
    454 * UTF-8 character. Returns 0 if it's an invalid leading byte.
    455 */
    456 static uint8_t
    457 bytes_in_char(uint8_t b)
    458 {
    459  if ((TOP_BITS(1) & b) == 0x00)
    460    return 1; // a 1-byte UTF-8 char, aka ASCII
    461  if ((TOP_BITS(3) & b) == TOP_BITS(2))
    462    return 2; // a 2-byte UTF-8 char
    463  if ((TOP_BITS(4) & b) == TOP_BITS(3))
    464    return 3; // a 3-byte UTF-8 char
    465  if ((TOP_BITS(5) & b) == TOP_BITS(4))
    466    return 4; // a 4-byte UTF-8 char
    467 
    468  // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111.
    469  return 0;
    470 }
    471 
    472 /** Returns true iff <b>b</b> is a UTF-8 continuation byte. */
    473 static bool
    474 is_continuation_byte(uint8_t b)
    475 {
    476  uint8_t top2bits = b & TOP_BITS(2);
    477  return top2bits == TOP_BITS(1);
    478 }
    479 
    480 /** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
    481 * character.
    482 */
    483 static bool
    484 validate_char(const uint8_t *c, uint8_t len)
    485 {
    486  if (len == 1)
    487    return true; // already validated this is an ASCII char earlier.
    488 
    489  uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
    490  uint32_t codepoint = c[0] & mask;
    491 
    492  mask = LOW_BITS(6); // bitmask for continuation bytes.
    493  for (uint8_t i = 1; i < len; i++) {
    494    if (!is_continuation_byte(c[i]))
    495      return false;
    496    codepoint <<= 6;
    497    codepoint |= (c[i] & mask);
    498  }
    499 
    500  if (len == 2 && codepoint <= 0x7f)
    501    return false; // Invalid, overly long encoding, should have fit in 1 byte.
    502 
    503  if (len == 3 && codepoint <= 0x7ff)
    504    return false; // Invalid, overly long encoding, should have fit in 2 bytes.
    505 
    506  if (len == 4 && codepoint <= 0xffff)
    507    return false; // Invalid, overly long encoding, should have fit in 3 bytes.
    508 
    509  if (codepoint >= 0xd800 && codepoint <= 0xdfff)
    510    return false; // Invalid, reserved for UTF-16 surrogate pairs.
    511 
    512  return codepoint <= 0x10ffff; // Check if within maximum.
    513 }
    514 
    515 /** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
    516    valid UTF-8 string. */
    517 int
    518 string_is_utf8(const char *str, size_t len)
    519 {
    520  // If str is NULL, don't try to read it
    521  if (!str) {
    522    // We could test for this case, but the low-level logs would produce
    523    // confusing test output.
    524    // LCOV_EXCL_START
    525    if (len) {
    526      // Use the low-level logging function, so that the log module can
    527      // validate UTF-8 (if needed in future code)
    528      tor_log_err_sigsafe(
    529        "BUG: string_is_utf8() called with NULL str but non-zero len.");
    530      // Since it's a bug, we should probably reject this string
    531      return false;
    532    }
    533    // LCOV_EXCL_STOP
    534    return true;
    535  }
    536 
    537  for (size_t i = 0; i < len;) {
    538    uint8_t num_bytes = bytes_in_char(str[i]);
    539    if (num_bytes == 0) // Invalid leading byte found.
    540      return false;
    541 
    542    size_t next_char = i + num_bytes;
    543    if (next_char > len)
    544      return false;
    545 
    546    // Validate the continuation bytes in this multi-byte character,
    547    // and advance to the next character in the string.
    548    if (!validate_char((const uint8_t*)&str[i], num_bytes))
    549      return false;
    550    i = next_char;
    551  }
    552  return true;
    553 }
    554 
    555 /** As string_is_utf8(), but returns false if the string begins with a UTF-8
    556 * byte order mark (BOM).
    557 */
    558 int
    559 string_is_utf8_no_bom(const char *str, size_t len)
    560 {
    561  if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") ||
    562                          !strcmpstart(str, "\uFFFE"))) {
    563    return false;
    564  }
    565  return string_is_utf8(str, len);
    566 }