[ tor ] .git.dasho

commit e231cd5b61afcb6640a7e17506bf33ddc6b1d2fe
parent 17a1ae025ac65d68bcfff2971fa6153daed7e220
Author: Nick Mathewson <nickm@torproject.org>
Date:   Tue,  7 Jan 2020 10:16:15 -0500

Merge branch 'ticket32845_squashed'

Diffstat:
A changes/ticket32845  | 4 ++++
M src/lib/string/util_string.c  | 21 +++++++++++++++++++--
M src/test/test_util.c  | 35 ++++++++++++++++++++++++++++++++++-

3 files changed, 57 insertions(+), 3 deletions(-)
diff --git a/changes/ticket32845 b/changes/ticket32845
@@ -0,0 +1,4 @@
+  o Testing:
+    - Add more test cases for tor's UTF-8 validation function. Also, check the
+      arguments passed to the function for consistency.
+      Closes ticket 32845.
diff --git a/src/lib/string/util_string.c b/src/lib/string/util_string.c
@@ -506,6 +506,23 @@ validate_char(const uint8_t *c, uint8_t len)
 int
 string_is_utf8(const char *str, size_t len)
 {
+  // If str is NULL, don't try to read it
+  if (!str) {
+    // We could test for this case, but the low-level logs would produce
+    // confusing test output.
+    // LCOV_EXCL_START
+    if (len) {
+      // Use the low-level logging function, so that the log module can
+      // validate UTF-8 (if needed in future code)
+      tor_log_err_sigsafe(
+        "BUG: string_is_utf8() called with NULL str but non-zero len.");
+      // Since it's a bug, we should probably reject this string
+      return false;
+    }
+    // LCOV_EXCL_STOP
+    return true;
+  }
+
   for (size_t i = 0; i < len;) {
     uint8_t num_bytes = bytes_in_char(str[i]);
     if (num_bytes == 0) // Invalid leading byte found.
@@ -530,8 +547,8 @@ string_is_utf8(const char *str, size_t len)
 int
 string_is_utf8_no_bom(const char *str, size_t len)
 {
-  if (len >= 3 && (!strcmpstart(str, "\uFEFF") ||
-                   !strcmpstart(str, "\uFFFE"))) {
+  if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") ||
+                          !strcmpstart(str, "\uFFFE"))) {
     return false;
   }
   return string_is_utf8(str, len);
diff --git a/src/test/test_util.c b/src/test/test_util.c
@@ -4104,10 +4104,43 @@ test_util_string_is_utf8(void *ptr)
   tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
   tt_int_op(1, OP_EQ, string_is_utf8("\xee\x80\x80", 3));
 
-  // The maximum legal codepoint, 10FFFF.
+  // The minimum legal codepoint, 0x00.
+  tt_int_op(1, OP_EQ, string_is_utf8("\0", 1));
+
+  // The maximum legal codepoint, 0x10FFFF.
   tt_int_op(1, OP_EQ, string_is_utf8("\xf4\x8f\xbf\xbf", 4));
   tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\x80\x80", 4));
 
+  /* Test cases that vary between programming languages /
+   * UTF-8 implementations.
+   * Source: POC||GTFO 19, page 43
+  * https://www.alchemistowl.org/pocorgtfo/
+   */
+
+  // Invalid (in most implementations)
+  // surrogate
+  tt_int_op(0, OP_EQ, string_is_utf8("\xed\xa0\x81", 3));
+  // nullsurrog
+  tt_int_op(0, OP_EQ, string_is_utf8("\x30\x00\xed\xa0\x81", 5));
+  // threehigh
+  tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
+  // fourhigh
+  tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\xbf\xbf", 4));
+  // fivebyte
+  tt_int_op(0, OP_EQ, string_is_utf8("\xfb\x80\x80\x80\x80", 5));
+  // sixbyte
+  tt_int_op(0, OP_EQ, string_is_utf8("\xfd\x80\x80\x80\x80", 5));
+  // sixhigh
+  tt_int_op(0, OP_EQ, string_is_utf8("\xfd\xbf\xbf\xbf\xbf", 5));
+
+  // Valid (in most implementations)
+  // fourbyte
+  tt_int_op(1, OP_EQ, string_is_utf8("\xf0\x90\x8d\x88", 4));
+  // fourbyte2
+  tt_int_op(1, OP_EQ, string_is_utf8("\xf0\xbf\xbf\xbf", 4));
+  // nullbyte
+  tt_int_op(1, OP_EQ, string_is_utf8("\x30\x31\x32\x00\x33", 5));
+
  done:
   ;
 }

	tor The Tor anonymity network
	git clone https://git.dasho.dev/tor.git
	Log | Files | Refs | README | LICENSE

A	changes/ticket32845	|	4	++++
M	src/lib/string/util_string.c	|	21	+++++++++++++++++++--
M	src/test/test_util.c	|	35	++++++++++++++++++++++++++++++++++-