utf_impl.cpp (11607B)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
*   Copyright (C) 1999-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  utf_impl.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep13
*   created by: Markus W. Scherer
*
*   This file provides implementation functions for macros in the utfXX.h
*   that would otherwise be too long as macros.
*/

/* set import/export definitions */
#ifndef U_UTF8_IMPL
#   define U_UTF8_IMPL
#endif

#include "unicode/utypes.h"
#include "unicode/utf.h"
#include "unicode/utf8.h"
#include "uassert.h"

/*
 * Table of the number of utf8 trail bytes, indexed by the lead byte.
 * Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h
 *
 * The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.
 *
 * Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were
 * changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES
 * may exist in old client code that must continue to run with newer icu library versions.
 *
 * This table could be replaced on many machines by
 * a few lines of assembler code using an
 * "index of first 0-bit from msb" instruction and
 * one or two more integer instructions.
 *
 * For example, on an i386, do something like
 * - MOV AL, leadByte
 * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
 * - MOV AH, 0
 * - BSR BX, AX     (16-bit)
 * - MOV AX, 6      (result)
 * - JZ finish      (ZF==1 if leadByte==0xff)
 * - SUB AX, BX     (result)
 * -finish:
 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
 *
 * Layout: 16 entries per row, so each row covers one high nibble of the
 * index (row 0 is 00..0F, the last row is F0..FF).
 */
U_CAPI const uint8_t
utf8_countTrailBytes[256]={
    // 00..3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 40..7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 80..BF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // illegal C0 & C1
    // 2-byte lead bytes C2..DF
    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    // 3-byte lead bytes E0..EF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 4-byte lead bytes F0..F4
    // illegal F5..FF
    3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/*
 * Error code points returned (or written) by the obsolete strict>=0 modes.
 * Only the first four entries are listed; the remaining two of the six
 * declared elements are zero-initialized. The code in this file indexes
 * this table with values 0..3 only.
 */
static const UChar32
utf8_errorValue[6]={
    // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
    // but without relying on the obsolete unicode/utf_old.h.
    0x15, 0x9f, 0xffff,
    0x10ffff
};

/*
 * Select the value to return for an illegal byte sequence.
 *
 * count:  index into utf8_errorValue; only consulted when strict>=0.
 *         Callers in this file pass either a constant 0..3 or the number of
 *         bytes consumed after the lead byte.
 * strict: error mode, as described above utf8_nextCharSafeBody():
 *         >=0 returns utf8_errorValue[count],
 *         -3  returns 0xfffd,
 *         any other negative value returns U_SENTINEL.
 */
static UChar32
errorValue(int32_t count, int8_t strict) {
    if(strict>=0) {
        return utf8_errorValue[count];
    } else if(strict==-3) {
        return 0xfffd;
    } else {
        return U_SENTINEL;
    }
}

/*
 * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
 * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
 *
 * U8_NEXT() supports NUL-terminated strings indicated via length<0.
 *
 * The "strict" parameter controls the error behavior:
 * <0  "Safe" behavior of U8_NEXT():
 *     -1: All illegal byte sequences yield U_SENTINEL=-1.
 *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
 *         Some implementations use this for roundtripping of
 *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
 *         contain unpaired surrogates.
 *     -3: All illegal byte sequences yield U+FFFD.
 *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., false):
 *     All illegal byte sequences yield a positive code point such that this
 *     result code point would be encoded with the same number of bytes as
 *     the illegal sequence.
 * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., true):
 *     Same as the obsolete "safe" behavior, but non-characters are also treated
 *     like illegal sequences.
 *
 * Parameters:
 *   s      - the bytes being decoded.
 *   pi     - in/out index. On entry *pi is one past the byte c. On return it is
 *            one past the last byte that was accepted: the end of the decoded
 *            sequence on success, or the point where validation stopped on error.
 *   length - limit index for reads from s; the byte s[length] is never read
 *            when length is non-negative. May be negative for NUL-terminated input.
 *   c      - the byte already read from s[*pi-1]; treated as a lead byte candidate.
 *   strict - error mode, see above.
 *
 * Returns the decoded code point, or errorValue(n, strict) where n is the
 * number of bytes after c that were consumed before the sequence was rejected.
 */
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, int8_t strict) {
    // *pi is one after byte c.
    int32_t i=*pi;
    // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
    // In the tests below, (t=s[i]-0x80)<=0x3f stores into a uint8_t, so any byte
    // outside 0x80..0xbf (including NUL) wraps to a value above 0x3f and fails.
    if(i==length || c>0xf4) {
        // end of string, or not a lead byte
    } else if(c>=0xf0) {
        // Test for 4-byte sequences first because
        // U8_NEXT() handles shorter valid sequences inline.
        uint8_t t1=s[i], t2, t3;
        // Keep only the three payload bits of the 4-byte lead byte.
        c&=7;
        // The order of the && operands matters: each ++i happens only after the
        // previous byte was accepted, which determines the error value below.
        if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
                ++i!=length && (t2=s[i]-0x80)<=0x3f &&
                ++i!=length && (t3=s[i]-0x80)<=0x3f) {
            ++i;
            // t1 still carries its 0x80 marker bit, hence the mask;
            // t2 and t3 already had 0x80 subtracted.
            c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
            // strict: forbid non-characters like U+fffe
            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
                *pi=i;
                return c;
            }
        }
    } else if(c>=0xe0) {
        // Keep only the four payload bits of the 3-byte lead byte.
        c&=0xf;
        if(strict!=-2) {
            uint8_t t1=s[i], t2;
            if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
                ++i;
                c=(c<<12)|((t1&0x3f)<<6)|t2;
                // strict: forbid non-characters like U+fffe
                if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
                    *pi=i;
                    return c;
                }
            }
        } else {
            // strict=-2 -> lenient: allow surrogates
            uint8_t t1=s[i]-0x80, t2;
            // With c==0 (lead byte 0xe0) a first trail value below 0x20 would
            // produce a code point below 0x800, so that combination is rejected.
            if(t1<=0x3f && (c>0 || t1>=0x20) &&
                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
                *pi=i+1;
                return (c<<12)|(t1<<6)|t2;
            }
        }
    } else if(c>=0xc2) {
        uint8_t t1=s[i]-0x80;
        if(t1<=0x3f) {
            *pi=i+1;
            return ((c-0xc0)<<6)|t1;
        }
    }  // else 0x80<=c<0xc2 is not a lead byte

    /* error handling */
    // i-*pi is the number of bytes accepted after c before the failure.
    c=errorValue(i-*pi, strict);
    *pi=i;
    return c;
}

/*
 * Encode code point c as a multi-byte UTF-8 sequence into s, with bounds checking.
 *
 * Parameters:
 *   s        - destination byte buffer.
 *   i        - index in s at which to start writing.
 *   length   - limit index: a sequence is written only if all of its bytes
 *              fit at indexes below length.
 *   c        - the code point to encode.
 *   pIsError - optional output flag. If non-null it is set to true on failure
 *              and nothing is written; it is never reset to false here.
 *
 * Returns the index after the last byte written, or i unchanged if nothing
 * was written.
 *
 * Failure means c is a surrogate (in the 3-byte range), c is above 0x10ffff
 * (negative values included, via the unsigned casts), or there is not enough
 * room. If pIsError is null, failure instead writes an error code point from
 * utf8_errorValue, chosen by the remaining space (capped at 3).
 *
 * NOTE(review): there is no one-byte branch, so a caller passing c<=0x7f gets
 * a two-byte sequence. Presumably the U8_APPEND() macro handles ASCII inline
 * before calling this function; confirm against unicode/utf8.h.
 */
U_CAPI int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
    if((uint32_t)(c)<=0x7ff) {
        if((i)+1<(length)) {
            (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
            return i;
        }
    } else if((uint32_t)(c)<=0xffff) {
        /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
        if((i)+2<(length) && !U_IS_SURROGATE(c)) {
            (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
            return i;
        }
    } else if((uint32_t)(c)<=0x10ffff) {
        if((i)+3<(length)) {
            (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
            (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
            return i;
        }
    }
    /* c>0x10ffff or not enough space, write an error value */
    if(pIsError!=nullptr) {
        *pIsError=true;
    } else {
        // From here on, length is reused as the number of bytes still available.
        length-=i;
        if(length>0) {
            int32_t offset;
            if(length>3) {
                length=3;
            }
            s+=i;
            offset=0;
            // Entries 0..2 are 0x15, 0x9f and 0xffff.
            c=utf8_errorValue[length-1];
            // NOTE(review): U8_APPEND_UNSAFE is defined in unicode/utf8.h, not here;
            // presumably it encodes c at s[offset] and advances offset - confirm.
            U8_APPEND_UNSAFE(s, offset, c);
            i=i+offset;
        }
    }
    return i;
}

/*
 * Decode the code point whose last byte is c, scanning backward from *pi.
 * At most three bytes before c are examined, and none below index start.
 *
 * Parameters:
 *   s      - the bytes being decoded.
 *   start  - lowest index of s that may be read.
 *   pi     - in/out index. On entry *pi is the index of byte c. When a
 *            sequence (complete or a recognized truncated prefix) is found,
 *            *pi is set to the index of its lead byte; otherwise it is
 *            left unchanged.
 *   c      - the byte at s[*pi].
 *   strict - error mode, with the same meaning as for utf8_nextCharSafeBody().
 *
 * Returns the decoded code point, or errorValue(n, strict) where n is the
 * number of bytes stepped back over (0 if no sequence was recognized).
 */
U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, int8_t strict) {
    // *pi is the index of byte c.
    int32_t i=*pi;
    if(U8_IS_TRAIL(c) && i>start) {
        uint8_t b1=s[--i];
        if(U8_IS_LEAD(b1)) {
            if(b1<0xe0) {
                // Complete 2-byte sequence: lead b1 followed by trail c.
                *pi=i;
                return ((b1-0xc0)<<6)|(c&0x3f);
            } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
                // Truncated 3- or 4-byte sequence.
                *pi=i;
                return errorValue(1, strict);
            }
        } else if(U8_IS_TRAIL(b1) && i>start) {
            // Extract the value bits from the last trail byte.
            c&=0x3f;
            uint8_t b2=s[--i];
            if(0xe0<=b2 && b2<=0xf4) {
                if(b2<0xf0) {
                    // Keep only the four payload bits of the 3-byte lead byte.
                    b2&=0xf;
                    if(strict!=-2) {
                        if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
                            // *pi moves to the lead byte even if the result is
                            // rejected as a non-character below.
                            *pi=i;
                            c=(b2<<12)|((b1&0x3f)<<6)|c;
                            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
                                return c;
                            } else {
                                // strict: forbid non-characters like U+fffe
                                return errorValue(2, strict);
                            }
                        }
                    } else {
                        // strict=-2 -> lenient: allow surrogates
                        b1-=0x80;
                        // With b2==0 (lead byte 0xe0) a first trail value below 0x20
                        // would produce a code point below 0x800, so it is rejected.
                        if((b2>0 || b1>=0x20)) {
                            *pi=i;
                            return (b2<<12)|(b1<<6)|c;
                        }
                    }
                } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
                    // Truncated 4-byte sequence.
                    *pi=i;
                    return errorValue(2, strict);
                }
            } else if(U8_IS_TRAIL(b2) && i>start) {
                uint8_t b3=s[--i];
                if(0xf0<=b3 && b3<=0xf4) {
                    // Keep only the three payload bits of the 4-byte lead byte.
                    b3&=7;
                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
                        *pi=i;
                        c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
                            return c;
                        } else {
                            // strict: forbid non-characters like U+fffe
                            return errorValue(3, strict);
                        }
                    }
                }
            }
        }
    }
    // No sequence recognized: c is reported as a single illegal byte
    // and *pi has not been modified.
    return errorValue(0, strict);
}

/*
 * Step back from index i to the start of the byte sequence that s[i] belongs to.
 *
 * Parameters:
 *   s     - the bytes being scanned; s[i] is read unconditionally.
 *   start - lowest index of s that may be read when stepping back.
 *   i     - index of the byte to step back from.
 *
 * Returns the index of the lead byte if s[i] is a trail byte that, together
 * with up to three preceding bytes, forms a complete sequence or an accepted
 * truncated prefix of one; otherwise returns i unchanged.
 */
U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
    // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
    int32_t orig_i=i;
    uint8_t c=s[i];
    if(U8_IS_TRAIL(c) && i>start) {
        uint8_t b1=s[--i];
        if(U8_IS_LEAD(b1)) {
            // A lead byte below 0xe0 needs no further check here;
            // 3- and 4-byte leads must also be compatible with the first trail byte c.
            if(b1<0xe0 ||
                    (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
                return i;
            }
        } else if(U8_IS_TRAIL(b1) && i>start) {
            uint8_t b2=s[--i];
            if(0xe0<=b2 && b2<=0xf4) {
                if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
                    return i;
                }
            } else if(U8_IS_TRAIL(b2) && i>start) {
                uint8_t b3=s[--i];
                if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
                    return i;
                }
            }
        }
    }
    return orig_i;
}