utf8.h (32412B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1999-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: utf8.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 1999sep13 16 * created by: Markus W. Scherer 17 */ 18 19 /** 20 * \file 21 * \brief C API: 8-bit Unicode handling macros 22 * 23 * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. 24 * 25 * For more information see utf.h and the ICU User Guide Strings chapter 26 * (https://unicode-org.github.io/icu/userguide/strings). 27 * 28 * <em>Usage:</em> 29 * ICU coding guidelines for if() statements should be followed when using these macros. 30 * Compound statements (curly braces {}) must be used for if-else-while... 31 * bodies and all macro statements should be terminated with semicolon. 32 */ 33 34 #ifndef __UTF8_H__ 35 #define __UTF8_H__ 36 37 #include <stdbool.h> 38 #include "unicode/umachine.h" 39 #ifndef __UTF_H__ 40 # include "unicode/utf.h" 41 #endif 42 43 /* internal definitions ----------------------------------------------------- */ 44 45 /** 46 * Counts the trail bytes for a UTF-8 lead byte. 47 * Returns 0 for 0..0xc1 as well as for 0xf5..0xff. 48 * leadByte might be evaluated multiple times. 49 * 50 * This is internal since it is not meant to be called directly by external clients; 51 * however it is called by public macros in this file and thus must remain stable. 52 * 53 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. 54 * @internal 55 */ 56 #define U8_COUNT_TRAIL_BYTES(leadByte) \ 57 (U8_IS_LEAD(leadByte) ? \ 58 ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0) 59 60 /** 61 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. 62 * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff. 63 * leadByte might be evaluated multiple times. 64 * 65 * This is internal since it is not meant to be called directly by external clients; 66 * however it is called by public macros in this file and thus must remain stable. 67 * 68 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. 69 * @internal 70 */ 71 #define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ 72 (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)) 73 74 /** 75 * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. 76 * 77 * This is internal since it is not meant to be called directly by external clients; 78 * however it is called by public macros in this file and thus must remain stable. 79 * @internal 80 */ 81 #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) 82 83 /** 84 * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. 85 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. 86 * Lead byte E0..EF bits 3..0 are used as byte index, 87 * first trail byte bits 7..5 are used as bit index into that byte. 88 * @see U8_IS_VALID_LEAD3_AND_T1 89 * @internal 90 */ 91 #define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" 92 93 /** 94 * Internal 3-byte UTF-8 validity check. 95 * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence. 96 * @internal 97 */ 98 #define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) 99 100 /** 101 * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. 102 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. 103 * First trail byte bits 7..4 are used as byte index, 104 * lead byte F0..F4 bits 2..0 are used as bit index into that byte. 105 * @see U8_IS_VALID_LEAD4_AND_T1 106 * @internal 107 */ 108 #define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" 109 110 /** 111 * Internal 4-byte UTF-8 validity check. 112 * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence. 113 * @internal 114 */ 115 #define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) 116 117 /** 118 * Function for handling "next code point" with error-checking. 119 * 120 * This is internal since it is not meant to be called directly by external clients; 121 * however it is called by public macros in this 122 * file and thus must remain stable, and should not be hidden when other internal 123 * functions are hidden (otherwise public macros would fail to compile). 124 * @internal 125 */ 126 U_CAPI UChar32 U_EXPORT2 127 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, int8_t strict); 128 129 /** 130 * Function for handling "append code point" with error-checking. 131 * 132 * This is internal since it is not meant to be called directly by external clients; 133 * however it is called by public macros in this 134 * file and thus must remain stable, and should not be hidden when other internal 135 * functions are hidden (otherwise public macros would fail to compile). 136 * @internal 137 */ 138 U_CAPI int32_t U_EXPORT2 139 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); 140 141 /** 142 * Function for handling "previous code point" with error-checking. 143 * 144 * This is internal since it is not meant to be called directly by external clients; 145 * however it is called by public macros in this 146 * file and thus must remain stable, and should not be hidden when other internal 147 * functions are hidden (otherwise public macros would fail to compile). 148 * @internal 149 */ 150 U_CAPI UChar32 U_EXPORT2 151 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, int8_t strict); 152 153 /** 154 * Function for handling "skip backward one code point" with error-checking. 155 * 156 * This is internal since it is not meant to be called directly by external clients; 157 * however it is called by public macros in this 158 * file and thus must remain stable, and should not be hidden when other internal 159 * functions are hidden (otherwise public macros would fail to compile). 160 * @internal 161 */ 162 U_CAPI int32_t U_EXPORT2 163 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); 164 165 /* single-code point definitions -------------------------------------------- */ 166 167 /** 168 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? 169 * @param c 8-bit code unit (byte) 170 * @return true or false 171 * @stable ICU 2.4 172 */ 173 #define U8_IS_SINGLE(c) ((int8_t)(c)>=0) 174 175 /** 176 * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4) 177 * @param c 8-bit code unit (byte) 178 * @return true or false 179 * @stable ICU 2.4 180 */ 181 #define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32) 182 // 0x32=0xf4-0xc2 183 184 /** 185 * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF) 186 * @param c 8-bit code unit (byte) 187 * @return true or false 188 * @stable ICU 2.4 189 */ 190 #define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40) 191 192 /** 193 * How many code units (bytes) are used for the UTF-8 encoding 194 * of this Unicode code point? 195 * @param c 32-bit code point 196 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point 197 * @stable ICU 2.4 198 */ 199 #define U8_LENGTH(c) \ 200 ((uint32_t)(c)<=0x7f ? 1 : \ 201 ((uint32_t)(c)<=0x7ff ? 2 : \ 202 ((uint32_t)(c)<=0xd7ff ? 3 : \ 203 ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ 204 ((uint32_t)(c)<=0xffff ? 3 : 4)\ 205 ) \ 206 ) \ 207 ) \ 208 ) 209 210 /** 211 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). 212 * @return 4 213 * @stable ICU 2.4 214 */ 215 #define U8_MAX_LENGTH 4 216 217 #ifndef U_HIDE_DRAFT_API 218 219 /** 220 * Returns the length of a well-formed UTF-8 byte sequence according to its lead byte. 221 * Returns 1 for 0..0xc1 as well as for 0xf5..0xff. 222 * leadByte might be evaluated multiple times. 223 * 224 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. 225 * @return 1..4 226 * @draft ICU 78 227 */ 228 #define U8_LENGTH_FROM_LEAD_BYTE(leadByte) (U8_COUNT_TRAIL_BYTES(leadByte) + 1) 229 230 /** 231 * Returns the length of a well-formed UTF-8 byte sequence according to its lead byte. 232 * Returns 1 for 0..0xc1. Undefined for 0xf5..0xff. 233 * leadByte might be evaluated multiple times. 234 * 235 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. 236 * @return 1..4 237 * @draft ICU 78 238 */ 239 #define U8_LENGTH_FROM_LEAD_BYTE_UNSAFE(leadByte) (U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) + 1) 240 241 #endif // U_HIDE_DRAFT_API 242 243 /** 244 * Get a code point from a string at a random-access offset, 245 * without changing the offset. 246 * The offset may point to either the lead byte or one of the trail bytes 247 * for a code point, in which case the macro will read all of the bytes 248 * for the code point. 249 * The result is undefined if the offset points to an illegal UTF-8 250 * byte sequence. 251 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. 252 * 253 * @param s const uint8_t * string 254 * @param i string offset 255 * @param c output UChar32 variable 256 * @see U8_GET 257 * @stable ICU 2.4 258 */ 259 #define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 260 int32_t _u8_get_unsafe_index=(int32_t)(i); \ 261 U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \ 262 U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \ 263 } UPRV_BLOCK_MACRO_END 264 265 /** 266 * Get a code point from a string at a random-access offset, 267 * without changing the offset. 268 * The offset may point to either the lead byte or one of the trail bytes 269 * for a code point, in which case the macro will read all of the bytes 270 * for the code point. 271 * 272 * The length can be negative for a NUL-terminated string. 273 * 274 * If the offset points to an illegal UTF-8 byte sequence, then 275 * c is set to a negative value. 276 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. 277 * 278 * @param s const uint8_t * string 279 * @param start int32_t starting string offset 280 * @param i int32_t string offset, must be start<=i<length 281 * @param length int32_t string length 282 * @param c output UChar32 variable, set to <0 in case of an error 283 * @see U8_GET_UNSAFE 284 * @stable ICU 2.4 285 */ 286 #define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ 287 int32_t _u8_get_index=(i); \ 288 U8_SET_CP_START(s, start, _u8_get_index); \ 289 U8_NEXT(s, _u8_get_index, length, c); \ 290 } UPRV_BLOCK_MACRO_END 291 292 /** 293 * Get a code point from a string at a random-access offset, 294 * without changing the offset. 295 * The offset may point to either the lead byte or one of the trail bytes 296 * for a code point, in which case the macro will read all of the bytes 297 * for the code point. 298 * 299 * The length can be negative for a NUL-terminated string. 300 * 301 * If the offset points to an illegal UTF-8 byte sequence, then 302 * c is set to U+FFFD. 303 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. 304 * 305 * This macro does not distinguish between a real U+FFFD in the text 306 * and U+FFFD returned for an ill-formed sequence. 307 * Use U8_GET() if that distinction is important. 308 * 309 * @param s const uint8_t * string 310 * @param start int32_t starting string offset 311 * @param i int32_t string offset, must be start<=i<length 312 * @param length int32_t string length 313 * @param c output UChar32 variable, set to U+FFFD in case of an error 314 * @see U8_GET 315 * @stable ICU 51 316 */ 317 #define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \ 318 int32_t _u8_get_index=(i); \ 319 U8_SET_CP_START(s, start, _u8_get_index); \ 320 U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ 321 } UPRV_BLOCK_MACRO_END 322 323 /* definitions with forward iteration --------------------------------------- */ 324 325 /** 326 * Get a code point from a string at a code point boundary offset, 327 * and advance the offset to the next code point boundary. 328 * (Post-incrementing forward iteration.) 329 * "Unsafe" macro, assumes well-formed UTF-8. 330 * 331 * The offset may point to the lead byte of a multi-byte sequence, 332 * in which case the macro will read the whole sequence. 333 * The result is undefined if the offset points to a trail byte 334 * or an illegal UTF-8 sequence. 335 * 336 * @param s const uint8_t * string 337 * @param i string offset 338 * @param c output UChar32 variable 339 * @see U8_NEXT 340 * @stable ICU 2.4 341 */ 342 #define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 343 (c)=(uint8_t)(s)[(i)++]; \ 344 if(!U8_IS_SINGLE(c)) { \ 345 if((c)<0xe0) { \ 346 (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \ 347 } else if((c)<0xf0) { \ 348 /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ 349 (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \ 350 (i)+=2; \ 351 } else { \ 352 (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \ 353 (i)+=3; \ 354 } \ 355 } \ 356 } UPRV_BLOCK_MACRO_END 357 358 /** 359 * Get a code point from a string at a code point boundary offset, 360 * and advance the offset to the next code point boundary. 361 * (Post-incrementing forward iteration.) 362 * "Safe" macro, checks for illegal sequences and for string boundaries. 363 * 364 * The length can be negative for a NUL-terminated string. 365 * 366 * The offset may point to the lead byte of a multi-byte sequence, 367 * in which case the macro will read the whole sequence. 368 * If the offset points to a trail byte or an illegal UTF-8 sequence, then 369 * c is set to a negative value. 370 * 371 * @param s const uint8_t * string 372 * @param i int32_t string offset, must be i<length 373 * @param length int32_t string length 374 * @param c output UChar32 variable, set to <0 in case of an error 375 * @see U8_NEXT_UNSAFE 376 * @stable ICU 2.4 377 */ 378 #define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL) 379 380 /** 381 * Get a code point from a string at a code point boundary offset, 382 * and advance the offset to the next code point boundary. 383 * (Post-incrementing forward iteration.) 384 * "Safe" macro, checks for illegal sequences and for string boundaries. 385 * 386 * The length can be negative for a NUL-terminated string. 387 * 388 * The offset may point to the lead byte of a multi-byte sequence, 389 * in which case the macro will read the whole sequence. 390 * If the offset points to a trail byte or an illegal UTF-8 sequence, then 391 * c is set to U+FFFD. 392 * 393 * This macro does not distinguish between a real U+FFFD in the text 394 * and U+FFFD returned for an ill-formed sequence. 395 * Use U8_NEXT() if that distinction is important. 396 * 397 * @param s const uint8_t * string 398 * @param i int32_t string offset, must be i<length 399 * @param length int32_t string length 400 * @param c output UChar32 variable, set to U+FFFD in case of an error 401 * @see U8_NEXT 402 * @stable ICU 51 403 */ 404 #define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd) 405 406 /** @internal */ 407 #define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \ 408 (c)=(uint8_t)(s)[(i)++]; \ 409 if(!U8_IS_SINGLE(c)) { \ 410 uint8_t __t = 0; \ 411 if((i)!=(length) && \ 412 /* fetch/validate/assemble all but last trail byte */ \ 413 ((c)>=0xe0 ? \ 414 ((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \ 415 U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \ 416 (__t&=0x3f, 1) \ 417 : /* U+10000..U+10FFFF */ \ 418 ((c)-=0xf0)<=4 && \ 419 U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \ 420 ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \ 421 (__t=(s)[i]-0x80)<=0x3f) && \ 422 /* valid second-to-last trail byte */ \ 423 ((c)=((c)<<6)|__t, ++(i)!=(length)) \ 424 : /* U+0080..U+07FF */ \ 425 (c)>=0xc2 && ((c)&=0x1f, 1)) && \ 426 /* last trail byte */ \ 427 (__t=(s)[i]-0x80)<=0x3f && \ 428 ((c)=((c)<<6)|__t, ++(i), 1)) { \ 429 } else { \ 430 (c)=(sub); /* ill-formed*/ \ 431 } \ 432 } \ 433 } UPRV_BLOCK_MACRO_END 434 435 /** 436 * Append a code point to a string, overwriting 1 to 4 bytes. 437 * The offset points to the current end of the string contents 438 * and is advanced (post-increment). 439 * "Unsafe" macro, assumes a valid code point and sufficient space in the string. 440 * Otherwise, the result is undefined. 441 * 442 * @param s const uint8_t * string buffer 443 * @param i string offset 444 * @param c code point to append 445 * @see U8_APPEND 446 * @stable ICU 2.4 447 */ 448 #define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 449 uint32_t __uc=(c); \ 450 if(__uc<=0x7f) { \ 451 (s)[(i)++]=(uint8_t)__uc; \ 452 } else { \ 453 if(__uc<=0x7ff) { \ 454 (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \ 455 } else { \ 456 if(__uc<=0xffff) { \ 457 (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ 458 } else { \ 459 (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ 460 (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ 461 } \ 462 (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ 463 } \ 464 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ 465 } \ 466 } UPRV_BLOCK_MACRO_END 467 468 /** 469 * Append a code point to a string, overwriting 1 to 4 bytes. 470 * The offset points to the current end of the string contents 471 * and is advanced (post-increment). 472 * "Safe" macro, checks for a valid code point. 473 * If a non-ASCII code point is written, checks for sufficient space in the string. 474 * If the code point is not valid or trail bytes do not fit, 475 * then isError is set to true. 476 * 477 * @param s const uint8_t * string buffer 478 * @param i int32_t string offset, must be i<capacity 479 * @param capacity int32_t size of the string buffer 480 * @param c UChar32 code point to append 481 * @param isError output UBool set to true if an error occurs, otherwise not modified 482 * @see U8_APPEND_UNSAFE 483 * @stable ICU 2.4 484 */ 485 #define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \ 486 uint32_t __uc=(c); \ 487 if(__uc<=0x7f) { \ 488 (s)[(i)++]=(uint8_t)__uc; \ 489 } else if(__uc<=0x7ff && (i)+1<(capacity)) { \ 490 (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \ 491 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ 492 } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \ 493 (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ 494 (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ 495 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ 496 } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \ 497 (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ 498 (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ 499 (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ 500 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ 501 } else { \ 502 (isError)=true; \ 503 } \ 504 } UPRV_BLOCK_MACRO_END 505 506 /** 507 * Advance the string offset from one code point boundary to the next. 508 * (Post-incrementing iteration.) 509 * "Unsafe" macro, assumes well-formed UTF-8. 510 * 511 * @param s const uint8_t * string 512 * @param i string offset 513 * @see U8_FWD_1 514 * @stable ICU 2.4 515 */ 516 #define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ 517 (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \ 518 } UPRV_BLOCK_MACRO_END 519 520 /** 521 * Advance the string offset from one code point boundary to the next. 522 * (Post-incrementing iteration.) 523 * "Safe" macro, checks for illegal sequences and for string boundaries. 524 * 525 * The length can be negative for a NUL-terminated string. 526 * 527 * @param s const uint8_t * string 528 * @param i int32_t string offset, must be i<length 529 * @param length int32_t string length 530 * @see U8_FWD_1_UNSAFE 531 * @stable ICU 2.4 532 */ 533 #define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \ 534 uint8_t __b=(s)[(i)++]; \ 535 if(U8_IS_LEAD(__b) && (i)!=(length)) { \ 536 uint8_t __t1=(s)[i]; \ 537 if((0xe0<=__b && __b<0xf0)) { \ 538 if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \ 539 ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \ 540 ++(i); \ 541 } \ 542 } else if(__b<0xe0) { \ 543 if(U8_IS_TRAIL(__t1)) { \ 544 ++(i); \ 545 } \ 546 } else /* b>=0xf0 */ { \ 547 if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \ 548 ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \ 549 ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \ 550 ++(i); \ 551 } \ 552 } \ 553 } \ 554 } UPRV_BLOCK_MACRO_END 555 556 /** 557 * Advance the string offset from one code point boundary to the n-th next one, 558 * i.e., move forward by n code points. 559 * (Post-incrementing iteration.) 560 * "Unsafe" macro, assumes well-formed UTF-8. 561 * 562 * @param s const uint8_t * string 563 * @param i string offset 564 * @param n number of code points to skip 565 * @see U8_FWD_N 566 * @stable ICU 2.4 567 */ 568 #define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ 569 int32_t __N=(n); \ 570 while(__N>0) { \ 571 U8_FWD_1_UNSAFE(s, i); \ 572 --__N; \ 573 } \ 574 } UPRV_BLOCK_MACRO_END 575 576 /** 577 * Advance the string offset from one code point boundary to the n-th next one, 578 * i.e., move forward by n code points. 579 * (Post-incrementing iteration.) 580 * "Safe" macro, checks for illegal sequences and for string boundaries. 581 * 582 * The length can be negative for a NUL-terminated string. 583 * 584 * @param s const uint8_t * string 585 * @param i int32_t string offset, must be i<length 586 * @param length int32_t string length 587 * @param n number of code points to skip 588 * @see U8_FWD_N_UNSAFE 589 * @stable ICU 2.4 590 */ 591 #define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \ 592 int32_t __N=(n); \ 593 while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ 594 U8_FWD_1(s, i, length); \ 595 --__N; \ 596 } \ 597 } UPRV_BLOCK_MACRO_END 598 599 /** 600 * Adjust a random-access offset to a code point boundary 601 * at the start of a code point. 602 * If the offset points to a UTF-8 trail byte, 603 * then the offset is moved backward to the corresponding lead byte. 604 * Otherwise, it is not modified. 605 * "Unsafe" macro, assumes well-formed UTF-8. 606 * 607 * @param s const uint8_t * string 608 * @param i string offset 609 * @see U8_SET_CP_START 610 * @stable ICU 2.4 611 */ 612 #define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ 613 while(U8_IS_TRAIL((s)[i])) { --(i); } \ 614 } UPRV_BLOCK_MACRO_END 615 616 /** 617 * Adjust a random-access offset to a code point boundary 618 * at the start of a code point. 619 * If the offset points to a UTF-8 trail byte, 620 * then the offset is moved backward to the corresponding lead byte. 621 * Otherwise, it is not modified. 622 * 623 * "Safe" macro, checks for illegal sequences and for string boundaries. 624 * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i]. 625 * 626 * @param s const uint8_t * string 627 * @param start int32_t starting string offset (usually 0) 628 * @param i int32_t string offset, must be start<=i 629 * @see U8_SET_CP_START_UNSAFE 630 * @see U8_TRUNCATE_IF_INCOMPLETE 631 * @stable ICU 2.4 632 */ 633 #define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ 634 if(U8_IS_TRAIL((s)[(i)])) { \ 635 (i)=utf8_back1SafeBody(s, start, (i)); \ 636 } \ 637 } UPRV_BLOCK_MACRO_END 638 639 /** 640 * If the string ends with a UTF-8 byte sequence that is valid so far 641 * but incomplete, then reduce the length of the string to end before 642 * the lead byte of that incomplete sequence. 643 * For example, if the string ends with E1 80, the length is reduced by 2. 644 * 645 * In all other cases (the string ends with a complete sequence, or it is not 646 * possible for any further trail byte to extend the trailing sequence) 647 * the length remains unchanged. 648 * 649 * Useful for processing text split across multiple buffers 650 * (save the incomplete sequence for later) 651 * and for optimizing iteration 652 * (check for string length only once per character). 653 * 654 * "Safe" macro, checks for illegal sequences and for string boundaries. 655 * Unlike U8_SET_CP_START(), this macro never reads s[length]. 656 * 657 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).) 658 * 659 * @param s const uint8_t * string 660 * @param start int32_t starting string offset (usually 0) 661 * @param length int32_t string length (usually start<=length) 662 * @see U8_SET_CP_START 663 * @stable ICU 61 664 */ 665 #define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \ 666 if((length)>(start)) { \ 667 uint8_t __b1=s[(length)-1]; \ 668 if(U8_IS_SINGLE(__b1)) { \ 669 /* common ASCII character */ \ 670 } else if(U8_IS_LEAD(__b1)) { \ 671 --(length); \ 672 } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \ 673 uint8_t __b2=s[(length)-2]; \ 674 if(0xe0<=__b2 && __b2<=0xf4) { \ 675 if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \ 676 U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \ 677 (length)-=2; \ 678 } \ 679 } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \ 680 uint8_t __b3=s[(length)-3]; \ 681 if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \ 682 (length)-=3; \ 683 } \ 684 } \ 685 } \ 686 } \ 687 } UPRV_BLOCK_MACRO_END 688 689 /* definitions with backward iteration -------------------------------------- */ 690 691 /** 692 * Move the string offset from one code point boundary to the previous one 693 * and get the code point between them. 694 * (Pre-decrementing backward iteration.) 695 * "Unsafe" macro, assumes well-formed UTF-8. 696 * 697 * The input offset may be the same as the string length. 698 * If the offset is behind a multi-byte sequence, then the macro will read 699 * the whole sequence. 700 * If the offset is behind a lead byte, then that itself 701 * will be returned as the code point. 702 * The result is undefined if the offset is behind an illegal UTF-8 sequence. 703 * 704 * @param s const uint8_t * string 705 * @param i string offset 706 * @param c output UChar32 variable 707 * @see U8_PREV 708 * @stable ICU 2.4 709 */ 710 #define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 711 (c)=(uint8_t)(s)[--(i)]; \ 712 if(!U8_IS_SINGLE(c)) { \ 713 uint8_t __b, __count=1, __shift=6; \ 714 \ 715 /* c is a trail byte */ \ 716 (c)&=0x3f; \ 717 for(;;) { \ 718 __b=(s)[--(i)]; \ 719 if(__b>=0xc0) { \ 720 U8_MASK_LEAD_BYTE(__b, __count); \ 721 (c)|=(UChar32)__b<<__shift; \ 722 break; \ 723 } else { \ 724 (c)|=(UChar32)(__b&0x3f)<<__shift; \ 725 ++__count; \ 726 __shift+=6; \ 727 } \ 728 } \ 729 } \ 730 } UPRV_BLOCK_MACRO_END 731 732 /** 733 * Move the string offset from one code point boundary to the previous one 734 * and get the code point between them. 735 * (Pre-decrementing backward iteration.) 736 * "Safe" macro, checks for illegal sequences and for string boundaries. 737 * 738 * The input offset may be the same as the string length. 739 * If the offset is behind a multi-byte sequence, then the macro will read 740 * the whole sequence. 741 * If the offset is behind a lead byte, then that itself 742 * will be returned as the code point. 743 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value. 744 * 745 * @param s const uint8_t * string 746 * @param start int32_t starting string offset (usually 0) 747 * @param i int32_t string offset, must be start<i 748 * @param c output UChar32 variable, set to <0 in case of an error 749 * @see U8_PREV_UNSAFE 750 * @stable ICU 2.4 751 */ 752 #define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 753 (c)=(uint8_t)(s)[--(i)]; \ 754 if(!U8_IS_SINGLE(c)) { \ 755 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ 756 } \ 757 } UPRV_BLOCK_MACRO_END 758 759 /** 760 * Move the string offset from one code point boundary to the previous one 761 * and get the code point between them. 762 * (Pre-decrementing backward iteration.) 763 * "Safe" macro, checks for illegal sequences and for string boundaries. 764 * 765 * The input offset may be the same as the string length. 766 * If the offset is behind a multi-byte sequence, then the macro will read 767 * the whole sequence. 768 * If the offset is behind a lead byte, then that itself 769 * will be returned as the code point. 770 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. 771 * 772 * This macro does not distinguish between a real U+FFFD in the text 773 * and U+FFFD returned for an ill-formed sequence. 774 * Use U8_PREV() if that distinction is important. 775 * 776 * @param s const uint8_t * string 777 * @param start int32_t starting string offset (usually 0) 778 * @param i int32_t string offset, must be start<i 779 * @param c output UChar32 variable, set to U+FFFD in case of an error 780 * @see U8_PREV 781 * @stable ICU 51 782 */ 783 #define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \ 784 (c)=(uint8_t)(s)[--(i)]; \ 785 if(!U8_IS_SINGLE(c)) { \ 786 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ 787 } \ 788 } UPRV_BLOCK_MACRO_END 789 790 /** 791 * Move the string offset from one code point boundary to the previous one. 792 * (Pre-decrementing backward iteration.) 793 * The input offset may be the same as the string length. 794 * "Unsafe" macro, assumes well-formed UTF-8. 795 * 796 * @param s const uint8_t * string 797 * @param i string offset 798 * @see U8_BACK_1 799 * @stable ICU 2.4 800 */ 801 #define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ 802 while(U8_IS_TRAIL((s)[--(i)])) {} \ 803 } UPRV_BLOCK_MACRO_END 804 805 /** 806 * Move the string offset from one code point boundary to the previous one. 807 * (Pre-decrementing backward iteration.) 808 * The input offset may be the same as the string length. 809 * "Safe" macro, checks for illegal sequences and for string boundaries. 810 * 811 * @param s const uint8_t * string 812 * @param start int32_t starting string offset (usually 0) 813 * @param i int32_t string offset, must be start<i 814 * @see U8_BACK_1_UNSAFE 815 * @stable ICU 2.4 816 */ 817 #define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \ 818 if(U8_IS_TRAIL((s)[--(i)])) { \ 819 (i)=utf8_back1SafeBody(s, start, (i)); \ 820 } \ 821 } UPRV_BLOCK_MACRO_END 822 823 /** 824 * Move the string offset from one code point boundary to the n-th one before it, 825 * i.e., move backward by n code points. 826 * (Pre-decrementing backward iteration.) 827 * The input offset may be the same as the string length. 828 * "Unsafe" macro, assumes well-formed UTF-8. 829 * 830 * @param s const uint8_t * string 831 * @param i string offset 832 * @param n number of code points to skip 833 * @see U8_BACK_N 834 * @stable ICU 2.4 835 */ 836 #define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \ 837 int32_t __N=(n); \ 838 while(__N>0) { \ 839 U8_BACK_1_UNSAFE(s, i); \ 840 --__N; \ 841 } \ 842 } UPRV_BLOCK_MACRO_END 843 844 /** 845 * Move the string offset from one code point boundary to the n-th one before it, 846 * i.e., move backward by n code points. 847 * (Pre-decrementing backward iteration.) 848 * The input offset may be the same as the string length. 849 * "Safe" macro, checks for illegal sequences and for string boundaries. 850 * 851 * @param s const uint8_t * string 852 * @param start int32_t index of the start of the string 853 * @param i int32_t string offset, must be start<i 854 * @param n number of code points to skip 855 * @see U8_BACK_N_UNSAFE 856 * @stable ICU 2.4 857 */ 858 #define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \ 859 int32_t __N=(n); \ 860 while(__N>0 && (i)>(start)) { \ 861 U8_BACK_1(s, start, i); \ 862 --__N; \ 863 } \ 864 } UPRV_BLOCK_MACRO_END 865 866 /** 867 * Adjust a random-access offset to a code point boundary after a code point. 868 * If the offset is behind a partial multi-byte sequence, 869 * then the offset is incremented to behind the whole sequence. 870 * Otherwise, it is not modified. 871 * The input offset may be the same as the string length. 872 * "Unsafe" macro, assumes well-formed UTF-8. 873 * 874 * @param s const uint8_t * string 875 * @param i string offset 876 * @see U8_SET_CP_LIMIT 877 * @stable ICU 2.4 878 */ 879 #define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \ 880 U8_BACK_1_UNSAFE(s, i); \ 881 U8_FWD_1_UNSAFE(s, i); \ 882 } UPRV_BLOCK_MACRO_END 883 884 /** 885 * Adjust a random-access offset to a code point boundary after a code point. 886 * If the offset is behind a partial multi-byte sequence, 887 * then the offset is incremented to behind the whole sequence. 888 * Otherwise, it is not modified. 889 * The input offset may be the same as the string length. 890 * "Safe" macro, checks for illegal sequences and for string boundaries. 891 * 892 * The length can be negative for a NUL-terminated string. 893 * 894 * @param s const uint8_t * string 895 * @param start int32_t starting string offset (usually 0) 896 * @param i int32_t string offset, must be start<=i<=length 897 * @param length int32_t string length 898 * @see U8_SET_CP_LIMIT_UNSAFE 899 * @stable ICU 2.4 900 */ 901 #define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \ 902 if((start)<(i) && ((i)<(length) || (length)<0)) { \ 903 U8_BACK_1(s, start, i); \ 904 U8_FWD_1(s, i, length); \ 905 } \ 906 } UPRV_BLOCK_MACRO_END 907 908 #endif