utf8_util.c (2275B)
1 /* Copyright 2013 Google Inc. All Rights Reserved. 2 3 Distributed under MIT license. 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 5 */ 6 7 /* Heuristics for deciding about the UTF8-ness of strings. */ 8 9 #include "utf8_util.h" 10 11 #include "../common/platform.h" 12 13 #if defined(__cplusplus) || defined(c_plusplus) 14 extern "C" { 15 #endif 16 17 static size_t BrotliParseAsUTF8( 18 int* symbol, const uint8_t* input, size_t size) { 19 /* ASCII */ 20 if ((input[0] & 0x80) == 0) { 21 *symbol = input[0]; 22 if (*symbol > 0) { 23 return 1; 24 } 25 } 26 /* 2-byte UTF8 */ 27 if (size > 1u && 28 (input[0] & 0xE0) == 0xC0 && 29 (input[1] & 0xC0) == 0x80) { 30 *symbol = (((input[0] & 0x1F) << 6) | 31 (input[1] & 0x3F)); 32 if (*symbol > 0x7F) { 33 return 2; 34 } 35 } 36 /* 3-byte UFT8 */ 37 if (size > 2u && 38 (input[0] & 0xF0) == 0xE0 && 39 (input[1] & 0xC0) == 0x80 && 40 (input[2] & 0xC0) == 0x80) { 41 *symbol = (((input[0] & 0x0F) << 12) | 42 ((input[1] & 0x3F) << 6) | 43 (input[2] & 0x3F)); 44 if (*symbol > 0x7FF) { 45 return 3; 46 } 47 } 48 /* 4-byte UFT8 */ 49 if (size > 3u && 50 (input[0] & 0xF8) == 0xF0 && 51 (input[1] & 0xC0) == 0x80 && 52 (input[2] & 0xC0) == 0x80 && 53 (input[3] & 0xC0) == 0x80) { 54 *symbol = (((input[0] & 0x07) << 18) | 55 ((input[1] & 0x3F) << 12) | 56 ((input[2] & 0x3F) << 6) | 57 (input[3] & 0x3F)); 58 if (*symbol > 0xFFFF && *symbol <= 0x10FFFF) { 59 return 4; 60 } 61 } 62 /* Not UTF8, emit a special symbol above the UTF8-code space */ 63 *symbol = 0x110000 | input[0]; 64 return 1; 65 } 66 67 /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/ 68 BROTLI_BOOL BrotliIsMostlyUTF8( 69 const uint8_t* data, const size_t pos, const size_t mask, 70 const size_t length, const double min_fraction) { 71 size_t size_utf8 = 0; 72 size_t i = 0; 73 while (i < length) { 74 int symbol; 75 size_t bytes_read = 76 BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i); 77 i += bytes_read; 78 if (symbol < 0x110000) size_utf8 += bytes_read; 79 } 80 return TO_BROTLI_BOOL((double)size_utf8 > min_fraction * (double)length); 81 } 82 83 #if defined(__cplusplus) || defined(c_plusplus) 84 } /* extern "C" */ 85 #endif