tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utf8_util.c (2275B)


      1 /* Copyright 2013 Google Inc. All Rights Reserved.
      2 
      3   Distributed under MIT license.
      4   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      5 */
      6 
      7 /* Heuristics for deciding about the UTF8-ness of strings. */
      8 
      9 #include "utf8_util.h"
     10 
     11 #include "../common/platform.h"
     12 
     13 #if defined(__cplusplus) || defined(c_plusplus)
     14 extern "C" {
     15 #endif
     16 
     17 static size_t BrotliParseAsUTF8(
     18    int* symbol, const uint8_t* input, size_t size) {
     19  /* ASCII */
     20  if ((input[0] & 0x80) == 0) {
     21    *symbol = input[0];
     22    if (*symbol > 0) {
     23      return 1;
     24    }
     25  }
     26  /* 2-byte UTF8 */
     27  if (size > 1u &&
     28      (input[0] & 0xE0) == 0xC0 &&
     29      (input[1] & 0xC0) == 0x80) {
     30    *symbol = (((input[0] & 0x1F) << 6) |
     31               (input[1] & 0x3F));
     32    if (*symbol > 0x7F) {
     33      return 2;
     34    }
     35  }
     36  /* 3-byte UFT8 */
     37  if (size > 2u &&
     38      (input[0] & 0xF0) == 0xE0 &&
     39      (input[1] & 0xC0) == 0x80 &&
     40      (input[2] & 0xC0) == 0x80) {
     41    *symbol = (((input[0] & 0x0F) << 12) |
     42               ((input[1] & 0x3F) << 6) |
     43               (input[2] & 0x3F));
     44    if (*symbol > 0x7FF) {
     45      return 3;
     46    }
     47  }
     48  /* 4-byte UFT8 */
     49  if (size > 3u &&
     50      (input[0] & 0xF8) == 0xF0 &&
     51      (input[1] & 0xC0) == 0x80 &&
     52      (input[2] & 0xC0) == 0x80 &&
     53      (input[3] & 0xC0) == 0x80) {
     54    *symbol = (((input[0] & 0x07) << 18) |
     55               ((input[1] & 0x3F) << 12) |
     56               ((input[2] & 0x3F) << 6) |
     57               (input[3] & 0x3F));
     58    if (*symbol > 0xFFFF && *symbol <= 0x10FFFF) {
     59      return 4;
     60    }
     61  }
     62  /* Not UTF8, emit a special symbol above the UTF8-code space */
     63  *symbol = 0x110000 | input[0];
     64  return 1;
     65 }
     66 
     67 /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
     68 BROTLI_BOOL BrotliIsMostlyUTF8(
     69    const uint8_t* data, const size_t pos, const size_t mask,
     70    const size_t length, const double min_fraction) {
     71  size_t size_utf8 = 0;
     72  size_t i = 0;
     73  while (i < length) {
     74    int symbol;
     75    size_t bytes_read =
     76        BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
     77    i += bytes_read;
     78    if (symbol < 0x110000) size_utf8 += bytes_read;
     79  }
     80  return TO_BROTLI_BOOL((double)size_utf8 > min_fraction * (double)length);
     81 }
     82 
     83 #if defined(__cplusplus) || defined(c_plusplus)
     84 }  /* extern "C" */
     85 #endif