ucnv_u8.cpp (31015B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2002-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: ucnv_u8.c 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002jul01 14 * created by: Markus W. Scherer 15 * 16 * UTF-8 converter implementation. Used to be in ucnv_utf.c. 17 * 18 * Also, CESU-8 implementation, see UTR 26. 19 * The CESU-8 converter uses all the same functions as the 20 * UTF-8 converter, with a branch for converting supplementary code points. 21 */ 22 23 #include "unicode/utypes.h" 24 25 #if !UCONFIG_NO_CONVERSION 26 27 #include "unicode/ucnv.h" 28 #include "unicode/utf.h" 29 #include "unicode/utf8.h" 30 #include "unicode/utf16.h" 31 #include "uassert.h" 32 #include "ucnv_bld.h" 33 #include "ucnv_cnv.h" 34 #include "cmemory.h" 35 #include "ustr_imp.h" 36 37 /* Prototypes --------------------------------------------------------------- */ 38 39 /* Keep these here to make finicky compilers happy */ 40 41 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, 42 UErrorCode *err); 43 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args, 44 UErrorCode *err); 45 46 47 /* UTF-8 -------------------------------------------------------------------- */ 48 49 #define MAXIMUM_UCS2 0x0000FFFF 50 51 static const uint32_t offsetsFromUTF8[5] = {0, 52 static_cast<uint32_t>(0x00000000), static_cast<uint32_t>(0x00003080), 53 static_cast<uint32_t>(0x000E2080), static_cast<uint32_t>(0x03C82080) 54 }; 55 56 static UBool hasCESU8Data(const UConverter *cnv) 57 { 58 #if UCONFIG_ONLY_HTML_CONVERSION 59 return false; 60 #else 61 return cnv->sharedData == &_CESU8Data; 62 #endif 63 } 64 U_CDECL_BEGIN 65 static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, 66 UErrorCode * err) 67 { 68 UConverter *cnv = args->converter; 69 const unsigned char *mySource = (unsigned char *) args->source; 70 char16_t *myTarget = args->target; 71 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 72 const char16_t *targetLimit = args->targetLimit; 73 unsigned char *toUBytes = cnv->toUBytes; 74 UBool isCESU8 = hasCESU8Data(cnv); 75 uint32_t ch, ch2 = 0; 76 int32_t i, inBytes; 77 78 /* Restore size of current sequence */ 79 if (cnv->toULength > 0 && myTarget < targetLimit) 80 { 81 inBytes = cnv->mode; /* restore # of bytes to consume */ 82 i = cnv->toULength; /* restore # of bytes consumed */ 83 cnv->toULength = 0; 84 85 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ 86 cnv->toUnicodeStatus = 0; 87 goto morebytes; 88 } 89 90 91 while (mySource < sourceLimit && myTarget < targetLimit) 92 { 93 ch = *(mySource++); 94 if (U8_IS_SINGLE(ch)) /* Simple case */ 95 { 96 *(myTarget++) = (char16_t) ch; 97 } 98 else 99 { 100 /* store the first char */ 101 toUBytes[0] = (char)ch; 102 inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */ 103 i = 1; 104 105 morebytes: 106 while (i < inBytes) 107 { 108 if (mySource < sourceLimit) 109 { 110 toUBytes[i] = (char) (ch2 = *mySource); 111 if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) && 112 !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2))) 113 { 114 break; /* i < inBytes */ 115 } 116 ch = (ch << 6) + ch2; 117 ++mySource; 118 i++; 119 } 120 else 121 { 122 /* stores a partially calculated target*/ 123 cnv->toUnicodeStatus = ch; 124 cnv->mode = inBytes; 125 cnv->toULength = (int8_t) i; 126 goto donefornow; 127 } 128 } 129 130 // In CESU-8, only surrogates, not supplementary code points, are encoded directly. 131 if (i == inBytes && (!isCESU8 || i <= 3)) 132 { 133 /* Remove the accumulated high bits */ 134 ch -= offsetsFromUTF8[inBytes]; 135 136 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 137 if (ch <= MAXIMUM_UCS2) 138 { 139 /* fits in 16 bits */ 140 *(myTarget++) = (char16_t) ch; 141 } 142 else 143 { 144 /* write out the surrogates */ 145 *(myTarget++) = U16_LEAD(ch); 146 ch = U16_TRAIL(ch); 147 if (myTarget < targetLimit) 148 { 149 *(myTarget++) = (char16_t)ch; 150 } 151 else 152 { 153 /* Put in overflow buffer (not handled here) */ 154 cnv->UCharErrorBuffer[0] = (char16_t) ch; 155 cnv->UCharErrorBufferLength = 1; 156 *err = U_BUFFER_OVERFLOW_ERROR; 157 break; 158 } 159 } 160 } 161 else 162 { 163 cnv->toULength = (int8_t)i; 164 *err = U_ILLEGAL_CHAR_FOUND; 165 break; 166 } 167 } 168 } 169 170 donefornow: 171 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 172 { 173 /* End of target buffer */ 174 *err = U_BUFFER_OVERFLOW_ERROR; 175 } 176 177 args->target = myTarget; 178 args->source = (const char *) mySource; 179 } 180 181 static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, 182 UErrorCode * err) 183 { 184 UConverter *cnv = args->converter; 185 const unsigned char *mySource = (unsigned char *) args->source; 186 char16_t *myTarget = args->target; 187 int32_t *myOffsets = args->offsets; 188 int32_t offsetNum = 0; 189 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 190 const char16_t *targetLimit = args->targetLimit; 191 unsigned char *toUBytes = cnv->toUBytes; 192 UBool isCESU8 = hasCESU8Data(cnv); 193 uint32_t ch, ch2 = 0; 194 int32_t i, inBytes; 195 196 /* Restore size of current sequence */ 197 if (cnv->toULength > 0 && myTarget < targetLimit) 198 { 199 inBytes = cnv->mode; /* restore # of bytes to consume */ 200 i = cnv->toULength; /* restore # of bytes consumed */ 201 cnv->toULength = 0; 202 203 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ 204 cnv->toUnicodeStatus = 0; 205 goto morebytes; 206 } 207 208 while (mySource < sourceLimit && myTarget < targetLimit) 209 { 210 ch = *(mySource++); 211 if (U8_IS_SINGLE(ch)) /* Simple case */ 212 { 213 *(myTarget++) = (char16_t) ch; 214 *(myOffsets++) = offsetNum++; 215 } 216 else 217 { 218 toUBytes[0] = (char)ch; 219 inBytes = U8_COUNT_BYTES_NON_ASCII(ch); 220 i = 1; 221 222 morebytes: 223 while (i < inBytes) 224 { 225 if (mySource < sourceLimit) 226 { 227 toUBytes[i] = (char) (ch2 = *mySource); 228 if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) && 229 !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2))) 230 { 231 break; /* i < inBytes */ 232 } 233 ch = (ch << 6) + ch2; 234 ++mySource; 235 i++; 236 } 237 else 238 { 239 cnv->toUnicodeStatus = ch; 240 cnv->mode = inBytes; 241 cnv->toULength = (int8_t)i; 242 goto donefornow; 243 } 244 } 245 246 // In CESU-8, only surrogates, not supplementary code points, are encoded directly. 247 if (i == inBytes && (!isCESU8 || i <= 3)) 248 { 249 /* Remove the accumulated high bits */ 250 ch -= offsetsFromUTF8[inBytes]; 251 252 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 253 if (ch <= MAXIMUM_UCS2) 254 { 255 /* fits in 16 bits */ 256 *(myTarget++) = (char16_t) ch; 257 *(myOffsets++) = offsetNum; 258 } 259 else 260 { 261 /* write out the surrogates */ 262 *(myTarget++) = U16_LEAD(ch); 263 *(myOffsets++) = offsetNum; 264 ch = U16_TRAIL(ch); 265 if (myTarget < targetLimit) 266 { 267 *(myTarget++) = (char16_t)ch; 268 *(myOffsets++) = offsetNum; 269 } 270 else 271 { 272 cnv->UCharErrorBuffer[0] = (char16_t) ch; 273 cnv->UCharErrorBufferLength = 1; 274 *err = U_BUFFER_OVERFLOW_ERROR; 275 } 276 } 277 offsetNum += i; 278 } 279 else 280 { 281 cnv->toULength = (int8_t)i; 282 *err = U_ILLEGAL_CHAR_FOUND; 283 break; 284 } 285 } 286 } 287 288 donefornow: 289 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 290 { /* End of target buffer */ 291 *err = U_BUFFER_OVERFLOW_ERROR; 292 } 293 294 args->target = myTarget; 295 args->source = (const char *) mySource; 296 args->offsets = myOffsets; 297 } 298 U_CDECL_END 299 300 U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, 301 UErrorCode * err) 302 { 303 UConverter *cnv = args->converter; 304 const char16_t *mySource = args->source; 305 const char16_t *sourceLimit = args->sourceLimit; 306 uint8_t *myTarget = (uint8_t *) args->target; 307 const uint8_t *targetLimit = (uint8_t *) args->targetLimit; 308 uint8_t *tempPtr; 309 UChar32 ch; 310 uint8_t tempBuf[4]; 311 int32_t indexToWrite; 312 UBool isNotCESU8 = !hasCESU8Data(cnv); 313 314 if (cnv->fromUChar32 && myTarget < targetLimit) 315 { 316 ch = cnv->fromUChar32; 317 cnv->fromUChar32 = 0; 318 goto lowsurrogate; 319 } 320 321 while (mySource < sourceLimit && myTarget < targetLimit) 322 { 323 ch = *(mySource++); 324 325 if (ch < 0x80) /* Single byte */ 326 { 327 *(myTarget++) = (uint8_t) ch; 328 } 329 else if (ch < 0x800) /* Double byte */ 330 { 331 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); 332 if (myTarget < targetLimit) 333 { 334 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); 335 } 336 else 337 { 338 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); 339 cnv->charErrorBufferLength = 1; 340 *err = U_BUFFER_OVERFLOW_ERROR; 341 } 342 } 343 else { 344 /* Check for surrogates */ 345 if(U16_IS_SURROGATE(ch) && isNotCESU8) { 346 lowsurrogate: 347 if (mySource < sourceLimit) { 348 /* test both code units */ 349 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { 350 /* convert and consume this supplementary code point */ 351 ch=U16_GET_SUPPLEMENTARY(ch, *mySource); 352 ++mySource; 353 /* exit this condition tree */ 354 } 355 else { 356 /* this is an unpaired trail or lead code unit */ 357 /* callback(illegal) */ 358 cnv->fromUChar32 = ch; 359 *err = U_ILLEGAL_CHAR_FOUND; 360 break; 361 } 362 } 363 else { 364 /* no more input */ 365 cnv->fromUChar32 = ch; 366 break; 367 } 368 } 369 370 /* Do we write the buffer directly for speed, 371 or do we have to be careful about target buffer space? */ 372 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); 373 374 if (ch <= MAXIMUM_UCS2) { 375 indexToWrite = 2; 376 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); 377 } 378 else { 379 indexToWrite = 3; 380 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); 381 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); 382 } 383 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); 384 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); 385 386 if (tempPtr == myTarget) { 387 /* There was enough space to write the codepoint directly. */ 388 myTarget += (indexToWrite + 1); 389 } 390 else { 391 /* We might run out of room soon. Write it slowly. */ 392 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { 393 if (myTarget < targetLimit) { 394 *(myTarget++) = *tempPtr; 395 } 396 else { 397 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; 398 *err = U_BUFFER_OVERFLOW_ERROR; 399 } 400 } 401 } 402 } 403 } 404 405 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 406 { 407 *err = U_BUFFER_OVERFLOW_ERROR; 408 } 409 410 args->target = (char *) myTarget; 411 args->source = mySource; 412 } 413 414 U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, 415 UErrorCode * err) 416 { 417 UConverter *cnv = args->converter; 418 const char16_t *mySource = args->source; 419 int32_t *myOffsets = args->offsets; 420 const char16_t *sourceLimit = args->sourceLimit; 421 uint8_t *myTarget = (uint8_t *) args->target; 422 const uint8_t *targetLimit = (uint8_t *) args->targetLimit; 423 uint8_t *tempPtr; 424 UChar32 ch; 425 int32_t offsetNum, nextSourceIndex; 426 int32_t indexToWrite; 427 uint8_t tempBuf[4]; 428 UBool isNotCESU8 = !hasCESU8Data(cnv); 429 430 if (cnv->fromUChar32 && myTarget < targetLimit) 431 { 432 ch = cnv->fromUChar32; 433 cnv->fromUChar32 = 0; 434 offsetNum = -1; 435 nextSourceIndex = 0; 436 goto lowsurrogate; 437 } else { 438 offsetNum = 0; 439 } 440 441 while (mySource < sourceLimit && myTarget < targetLimit) 442 { 443 ch = *(mySource++); 444 445 if (ch < 0x80) /* Single byte */ 446 { 447 *(myOffsets++) = offsetNum++; 448 *(myTarget++) = (char) ch; 449 } 450 else if (ch < 0x800) /* Double byte */ 451 { 452 *(myOffsets++) = offsetNum; 453 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); 454 if (myTarget < targetLimit) 455 { 456 *(myOffsets++) = offsetNum++; 457 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); 458 } 459 else 460 { 461 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); 462 cnv->charErrorBufferLength = 1; 463 *err = U_BUFFER_OVERFLOW_ERROR; 464 } 465 } 466 else 467 /* Check for surrogates */ 468 { 469 nextSourceIndex = offsetNum + 1; 470 471 if(U16_IS_SURROGATE(ch) && isNotCESU8) { 472 lowsurrogate: 473 if (mySource < sourceLimit) { 474 /* test both code units */ 475 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { 476 /* convert and consume this supplementary code point */ 477 ch=U16_GET_SUPPLEMENTARY(ch, *mySource); 478 ++mySource; 479 ++nextSourceIndex; 480 /* exit this condition tree */ 481 } 482 else { 483 /* this is an unpaired trail or lead code unit */ 484 /* callback(illegal) */ 485 cnv->fromUChar32 = ch; 486 *err = U_ILLEGAL_CHAR_FOUND; 487 break; 488 } 489 } 490 else { 491 /* no more input */ 492 cnv->fromUChar32 = ch; 493 break; 494 } 495 } 496 497 /* Do we write the buffer directly for speed, 498 or do we have to be careful about target buffer space? */ 499 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); 500 501 if (ch <= MAXIMUM_UCS2) { 502 indexToWrite = 2; 503 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); 504 } 505 else { 506 indexToWrite = 3; 507 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); 508 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); 509 } 510 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); 511 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); 512 513 if (tempPtr == myTarget) { 514 /* There was enough space to write the codepoint directly. */ 515 myTarget += (indexToWrite + 1); 516 myOffsets[0] = offsetNum; 517 myOffsets[1] = offsetNum; 518 myOffsets[2] = offsetNum; 519 if (indexToWrite >= 3) { 520 myOffsets[3] = offsetNum; 521 } 522 myOffsets += (indexToWrite + 1); 523 } 524 else { 525 /* We might run out of room soon. Write it slowly. */ 526 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { 527 if (myTarget < targetLimit) 528 { 529 *(myOffsets++) = offsetNum; 530 *(myTarget++) = *tempPtr; 531 } 532 else 533 { 534 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; 535 *err = U_BUFFER_OVERFLOW_ERROR; 536 } 537 } 538 } 539 offsetNum = nextSourceIndex; 540 } 541 } 542 543 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 544 { 545 *err = U_BUFFER_OVERFLOW_ERROR; 546 } 547 548 args->target = (char *) myTarget; 549 args->source = mySource; 550 args->offsets = myOffsets; 551 } 552 553 U_CDECL_BEGIN 554 static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, 555 UErrorCode *err) { 556 UConverter *cnv; 557 const uint8_t *sourceInitial; 558 const uint8_t *source; 559 uint8_t myByte; 560 UChar32 ch; 561 int8_t i; 562 563 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ 564 565 cnv = args->converter; 566 sourceInitial = source = (const uint8_t *)args->source; 567 if (source >= (const uint8_t *)args->sourceLimit) 568 { 569 /* no input */ 570 *err = U_INDEX_OUTOFBOUNDS_ERROR; 571 return 0xffff; 572 } 573 574 myByte = *(source++); 575 if (U8_IS_SINGLE(myByte)) 576 { 577 args->source = (const char *)source; 578 return (UChar32)myByte; 579 } 580 581 uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte); 582 if (countTrailBytes == 0) { 583 cnv->toUBytes[0] = myByte; 584 cnv->toULength = 1; 585 *err = U_ILLEGAL_CHAR_FOUND; 586 args->source = (const char *)source; 587 return 0xffff; 588 } 589 590 /*The byte sequence is longer than the buffer area passed*/ 591 if (((const char *)source + countTrailBytes) > args->sourceLimit) 592 { 593 /* check if all of the remaining bytes are trail bytes */ 594 uint16_t extraBytesToWrite = countTrailBytes + 1; 595 cnv->toUBytes[0] = myByte; 596 i = 1; 597 *err = U_TRUNCATED_CHAR_FOUND; 598 while(source < (const uint8_t *)args->sourceLimit) { 599 uint8_t b = *source; 600 if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) { 601 cnv->toUBytes[i++] = b; 602 ++source; 603 } else { 604 /* error even before we run out of input */ 605 *err = U_ILLEGAL_CHAR_FOUND; 606 break; 607 } 608 } 609 cnv->toULength = i; 610 args->source = (const char *)source; 611 return 0xffff; 612 } 613 614 ch = myByte << 6; 615 if(countTrailBytes == 2) { 616 uint8_t t1 = *source, t2; 617 if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) { 618 args->source = (const char *)(source + 1); 619 return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3]; 620 } 621 } else if(countTrailBytes == 1) { 622 uint8_t t1 = *source; 623 if(U8_IS_TRAIL(t1)) { 624 args->source = (const char *)(source + 1); 625 return (ch + t1) - offsetsFromUTF8[2]; 626 } 627 } else { // countTrailBytes == 3 628 uint8_t t1 = *source, t2, t3; 629 if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) && 630 U8_IS_TRAIL(t3 = *++source)) { 631 args->source = (const char *)(source + 1); 632 return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4]; 633 } 634 } 635 args->source = (const char *)source; 636 637 for(i = 0; sourceInitial < source; ++i) { 638 cnv->toUBytes[i] = *sourceInitial++; 639 } 640 cnv->toULength = i; 641 *err = U_ILLEGAL_CHAR_FOUND; 642 return 0xffff; 643 } 644 U_CDECL_END 645 646 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ 647 648 U_CDECL_BEGIN 649 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ 650 static void U_CALLCONV 651 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 652 UConverterToUnicodeArgs *pToUArgs, 653 UErrorCode *pErrorCode) { 654 UConverter *utf8; 655 const uint8_t *source, *sourceLimit; 656 uint8_t *target; 657 int32_t targetCapacity; 658 int32_t count; 659 660 int8_t oldToULength, toULength, toULimit; 661 662 UChar32 c; 663 uint8_t b, t1, t2; 664 665 /* set up the local pointers */ 666 utf8=pToUArgs->converter; 667 source=(uint8_t *)pToUArgs->source; 668 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 669 target=(uint8_t *)pFromUArgs->target; 670 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 671 672 /* get the converter state from the UTF-8 UConverter */ 673 if(utf8->toULength > 0) { 674 toULength=oldToULength=utf8->toULength; 675 toULimit=(int8_t)utf8->mode; 676 c=(UChar32)utf8->toUnicodeStatus; 677 } else { 678 toULength=oldToULength=toULimit=0; 679 c = 0; 680 } 681 682 count=(int32_t)(sourceLimit-source)+oldToULength; 683 if(count<toULimit) { 684 /* 685 * Not enough input to complete the partial character. 686 * Jump to moreBytes below - it will not output to target. 687 */ 688 } else if(targetCapacity<toULimit) { 689 /* 690 * Not enough target capacity to output the partial character. 691 * Let the standard converter handle this. 692 */ 693 *pErrorCode=U_USING_DEFAULT_WARNING; 694 return; 695 } else { 696 // Use a single counter for source and target, counting the minimum of 697 // the source length and the target capacity. 698 // Let the standard converter handle edge cases. 699 if(count>targetCapacity) { 700 count=targetCapacity; 701 } 702 703 // The conversion loop checks count>0 only once per character. 704 // If the buffer ends with a truncated sequence, 705 // then we reduce the count to stop before that, 706 // and collect the remaining bytes after the conversion loop. 707 708 // Do not go back into the bytes that will be read for finishing a partial 709 // sequence from the previous buffer. 710 int32_t length=count-toULength; 711 U8_TRUNCATE_IF_INCOMPLETE(source, 0, length); 712 count=toULength+length; 713 } 714 715 if(c!=0) { 716 utf8->toUnicodeStatus=0; 717 utf8->toULength=0; 718 goto moreBytes; 719 /* See note in ucnv_SBCSFromUTF8() about this goto. */ 720 } 721 722 /* conversion loop */ 723 while(count>0) { 724 b=*source++; 725 if(U8_IS_SINGLE(b)) { 726 /* convert ASCII */ 727 *target++=b; 728 --count; 729 continue; 730 } else { 731 if(b>=0xe0) { 732 if( /* handle U+0800..U+FFFF inline */ 733 b<0xf0 && 734 U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) && 735 U8_IS_TRAIL(t2=source[1]) 736 ) { 737 source+=2; 738 *target++=b; 739 *target++=t1; 740 *target++=t2; 741 count-=3; 742 continue; 743 } 744 } else { 745 if( /* handle U+0080..U+07FF inline */ 746 b>=0xc2 && 747 U8_IS_TRAIL(t1=*source) 748 ) { 749 ++source; 750 *target++=b; 751 *target++=t1; 752 count-=2; 753 continue; 754 } 755 } 756 757 /* handle "complicated" and error cases, and continuing partial characters */ 758 oldToULength=0; 759 toULength=1; 760 toULimit=U8_COUNT_BYTES_NON_ASCII(b); 761 c=b; 762 moreBytes: 763 while(toULength<toULimit) { 764 if(source<sourceLimit) { 765 b=*source; 766 if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) { 767 ++source; 768 ++toULength; 769 c=(c<<6)+b; 770 } else { 771 break; /* sequence too short, stop with toULength<toULimit */ 772 } 773 } else { 774 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 775 source-=(toULength-oldToULength); 776 while(oldToULength<toULength) { 777 utf8->toUBytes[oldToULength++]=*source++; 778 } 779 utf8->toUnicodeStatus=c; 780 utf8->toULength=toULength; 781 utf8->mode=toULimit; 782 pToUArgs->source=(char *)source; 783 pFromUArgs->target=(char *)target; 784 return; 785 } 786 } 787 788 if(toULength!=toULimit) { 789 /* error handling: illegal UTF-8 byte sequence */ 790 source-=(toULength-oldToULength); 791 while(oldToULength<toULength) { 792 utf8->toUBytes[oldToULength++]=*source++; 793 } 794 utf8->toULength=toULength; 795 pToUArgs->source=(char *)source; 796 pFromUArgs->target=(char *)target; 797 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 798 return; 799 } 800 801 /* copy the legal byte sequence to the target */ 802 { 803 int8_t i; 804 805 for(i=0; i<oldToULength; ++i) { 806 *target++=utf8->toUBytes[i]; 807 } 808 source-=(toULength-oldToULength); 809 for(; i<toULength; ++i) { 810 *target++=*source++; 811 } 812 count-=toULength; 813 } 814 } 815 } 816 U_ASSERT(count>=0); 817 818 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { 819 if(target==(const uint8_t *)pFromUArgs->targetLimit) { 820 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 821 } else { 822 b=*source; 823 toULimit=U8_COUNT_BYTES(b); 824 if(toULimit>(sourceLimit-source)) { 825 /* collect a truncated byte sequence */ 826 toULength=0; 827 c=b; 828 for(;;) { 829 utf8->toUBytes[toULength++]=b; 830 if(++source==sourceLimit) { 831 /* partial byte sequence at end of source */ 832 utf8->toUnicodeStatus=c; 833 utf8->toULength=toULength; 834 utf8->mode=toULimit; 835 break; 836 } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) { 837 utf8->toULength=toULength; 838 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 839 break; 840 } 841 c=(c<<6)+b; 842 } 843 } else { 844 /* partial-sequence target overflow: fall back to the pivoting implementation */ 845 *pErrorCode=U_USING_DEFAULT_WARNING; 846 } 847 } 848 } 849 850 /* write back the updated pointers */ 851 pToUArgs->source=(char *)source; 852 pFromUArgs->target=(char *)target; 853 } 854 855 U_CDECL_END 856 857 /* UTF-8 converter data ----------------------------------------------------- */ 858 859 static const UConverterImpl _UTF8Impl={ 860 UCNV_UTF8, 861 862 nullptr, 863 nullptr, 864 865 nullptr, 866 nullptr, 867 nullptr, 868 869 ucnv_toUnicode_UTF8, 870 ucnv_toUnicode_UTF8_OFFSETS_LOGIC, 871 ucnv_fromUnicode_UTF8, 872 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 873 ucnv_getNextUChar_UTF8, 874 875 nullptr, 876 nullptr, 877 nullptr, 878 nullptr, 879 ucnv_getNonSurrogateUnicodeSet, 880 881 ucnv_UTF8FromUTF8, 882 ucnv_UTF8FromUTF8 883 }; 884 885 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */ 886 static const UConverterStaticData _UTF8StaticData={ 887 sizeof(UConverterStaticData), 888 "UTF-8", 889 1208, UCNV_IBM, UCNV_UTF8, 890 1, 3, /* max 3 bytes per char16_t from UTF-8 (4 bytes from surrogate _pair_) */ 891 { 0xef, 0xbf, 0xbd, 0 },3,false,false, 892 0, 893 0, 894 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 895 }; 896 897 898 const UConverterSharedData _UTF8Data= 899 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl); 900 901 /* CESU-8 converter data ---------------------------------------------------- */ 902 903 static const UConverterImpl _CESU8Impl={ 904 UCNV_CESU8, 905 906 nullptr, 907 nullptr, 908 909 nullptr, 910 nullptr, 911 nullptr, 912 913 ucnv_toUnicode_UTF8, 914 ucnv_toUnicode_UTF8_OFFSETS_LOGIC, 915 ucnv_fromUnicode_UTF8, 916 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 917 nullptr, 918 919 nullptr, 920 nullptr, 921 nullptr, 922 nullptr, 923 ucnv_getCompleteUnicodeSet, 924 925 nullptr, 926 nullptr 927 }; 928 929 static const UConverterStaticData _CESU8StaticData={ 930 sizeof(UConverterStaticData), 931 "CESU-8", 932 9400, /* CCSID for CESU-8 */ 933 UCNV_UNKNOWN, UCNV_CESU8, 1, 3, 934 { 0xef, 0xbf, 0xbd, 0 },3,false,false, 935 0, 936 0, 937 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 938 }; 939 940 941 const UConverterSharedData _CESU8Data= 942 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl); 943 944 #endif