tor

The Tor anonymity network
git clone https://git.dasho.dev/tor.git
Log | Files | Refs | README | LICENSE

commit e231cd5b61afcb6640a7e17506bf33ddc6b1d2fe
parent 17a1ae025ac65d68bcfff2971fa6153daed7e220
Author: Nick Mathewson <nickm@torproject.org>
Date:   Tue,  7 Jan 2020 10:16:15 -0500

Merge branch 'ticket32845_squashed'

Diffstat:
A changes/ticket32845 | 4 ++++
M src/lib/string/util_string.c | 21 +++++++++++++++++++--
M src/test/test_util.c | 35 ++++++++++++++++++++++++++++++++++-
3 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/changes/ticket32845 b/changes/ticket32845 @@ -0,0 +1,4 @@ + o Testing: + - Add more test cases for tor's UTF-8 validation function. Also, check the + arguments passed to the function for consistency. + Closes ticket 32845. diff --git a/src/lib/string/util_string.c b/src/lib/string/util_string.c @@ -506,6 +506,23 @@ validate_char(const uint8_t *c, uint8_t len) int string_is_utf8(const char *str, size_t len) { + // If str is NULL, don't try to read it + if (!str) { + // We could test for this case, but the low-level logs would produce + // confusing test output. + // LCOV_EXCL_START + if (len) { + // Use the low-level logging function, so that the log module can + // validate UTF-8 (if needed in future code) + tor_log_err_sigsafe( + "BUG: string_is_utf8() called with NULL str but non-zero len."); + // Since it's a bug, we should probably reject this string + return false; + } + // LCOV_EXCL_STOP + return true; + } + for (size_t i = 0; i < len;) { uint8_t num_bytes = bytes_in_char(str[i]); if (num_bytes == 0) // Invalid leading byte found. @@ -530,8 +547,8 @@ string_is_utf8(const char *str, size_t len) int string_is_utf8_no_bom(const char *str, size_t len) { - if (len >= 3 && (!strcmpstart(str, "\uFEFF") || - !strcmpstart(str, "\uFFFE"))) { + if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") || + !strcmpstart(str, "\uFFFE"))) { return false; } return string_is_utf8(str, len); diff --git a/src/test/test_util.c b/src/test/test_util.c @@ -4104,10 +4104,43 @@ test_util_string_is_utf8(void *ptr) tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3)); tt_int_op(1, OP_EQ, string_is_utf8("\xee\x80\x80", 3)); - // The maximum legal codepoint, 10FFFF. + // The minimum legal codepoint, 0x00. + tt_int_op(1, OP_EQ, string_is_utf8("\0", 1)); + + // The maximum legal codepoint, 0x10FFFF. 
tt_int_op(1, OP_EQ, string_is_utf8("\xf4\x8f\xbf\xbf", 4)); tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\x80\x80", 4)); + /* Test cases that vary between programming languages / + * UTF-8 implementations. + * Source: POC||GTFO 19, page 43 +​ * https://www.alchemistowl.org/pocorgtfo/ + */ + + // Invalid (in most implementations) + // surrogate + tt_int_op(0, OP_EQ, string_is_utf8("\xed\xa0\x81", 3)); + // nullsurrog + tt_int_op(0, OP_EQ, string_is_utf8("\x30\x00\xed\xa0\x81", 5)); + // threehigh + tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3)); + // fourhigh + tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\xbf\xbf", 4)); + // fivebyte + tt_int_op(0, OP_EQ, string_is_utf8("\xfb\x80\x80\x80\x80", 5)); + // sixbyte + tt_int_op(0, OP_EQ, string_is_utf8("\xfd\x80\x80\x80\x80", 5)); + // sixhigh + tt_int_op(0, OP_EQ, string_is_utf8("\xfd\xbf\xbf\xbf\xbf", 5)); + + // Valid (in most implementations) + // fourbyte + tt_int_op(1, OP_EQ, string_is_utf8("\xf0\x90\x8d\x88", 4)); + // fourbyte2 + tt_int_op(1, OP_EQ, string_is_utf8("\xf0\xbf\xbf\xbf", 4)); + // nullbyte + tt_int_op(1, OP_EQ, string_is_utf8("\x30\x31\x32\x00\x33", 5)); + done: ; }