uiter.cpp (32514B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uiter.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002jan18 16 * created by: Markus W. Scherer 17 */ 18 19 #include "unicode/utypes.h" 20 #include "unicode/ustring.h" 21 #include "unicode/chariter.h" 22 #include "unicode/rep.h" 23 #include "unicode/uiter.h" 24 #include "unicode/utf.h" 25 #include "unicode/utf8.h" 26 #include "unicode/utf16.h" 27 #include "cstring.h" 28 29 U_NAMESPACE_USE 30 31 #define IS_EVEN(n) (((n)&1)==0) 32 #define IS_POINTER_EVEN(p) IS_EVEN((size_t)p) 33 34 U_CDECL_BEGIN 35 36 /* No-Op UCharIterator implementation for illegal input --------------------- */ 37 38 static int32_t U_CALLCONV 39 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) { 40 return 0; 41 } 42 43 static int32_t U_CALLCONV 44 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) { 45 return 0; 46 } 47 48 static UBool U_CALLCONV 49 noopHasNext(UCharIterator * /*iter*/) { 50 return false; 51 } 52 53 static UChar32 U_CALLCONV 54 noopCurrent(UCharIterator * /*iter*/) { 55 return U_SENTINEL; 56 } 57 58 static uint32_t U_CALLCONV 59 noopGetState(const UCharIterator * /*iter*/) { 60 return UITER_NO_STATE; 61 } 62 63 static void U_CALLCONV 64 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) { 65 *pErrorCode=U_UNSUPPORTED_ERROR; 66 } 67 68 static const UCharIterator noopIterator={ 69 nullptr, 0, 0, 0, 0, 0, 70 noopGetIndex, 71 noopMove, 72 noopHasNext, 73 noopHasNext, 74 noopCurrent, 75 noopCurrent, 76 noopCurrent, 77 nullptr, 78 noopGetState, 79 noopSetState 80 }; 81 82 /* UCharIterator implementation for simple strings -------------------------- */ 83 84 /* 85 * This is an implementation of a code unit (char16_t) iterator 86 * for char16_t * strings. 87 * 88 * The UCharIterator.context field holds a pointer to the string. 89 */ 90 91 static int32_t U_CALLCONV 92 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED { 93 switch(origin) { 94 case UITER_ZERO: 95 return 0; 96 case UITER_START: 97 return iter->start; 98 case UITER_CURRENT: 99 return iter->index; 100 case UITER_LIMIT: 101 return iter->limit; 102 case UITER_LENGTH: 103 return iter->length; 104 default: 105 /* not a valid origin */ 106 /* Should never get here! */ 107 return -1; 108 } 109 } 110 111 static int32_t U_CALLCONV 112 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED { 113 int32_t pos; 114 115 switch(origin) { 116 case UITER_ZERO: 117 pos=delta; 118 break; 119 case UITER_START: 120 pos=iter->start+delta; 121 break; 122 case UITER_CURRENT: 123 pos=iter->index+delta; 124 break; 125 case UITER_LIMIT: 126 pos=iter->limit+delta; 127 break; 128 case UITER_LENGTH: 129 pos=iter->length+delta; 130 break; 131 default: 132 return -1; /* Error */ 133 } 134 135 if(pos<iter->start) { 136 pos=iter->start; 137 } else if(pos>iter->limit) { 138 pos=iter->limit; 139 } 140 141 return iter->index=pos; 142 } 143 144 static UBool U_CALLCONV 145 stringIteratorHasNext(UCharIterator *iter) { 146 return iter->index<iter->limit; 147 } 148 149 static UBool U_CALLCONV 150 stringIteratorHasPrevious(UCharIterator *iter) { 151 return iter->index>iter->start; 152 } 153 154 static UChar32 U_CALLCONV 155 stringIteratorCurrent(UCharIterator *iter) { 156 if(iter->index<iter->limit) { 157 return ((const char16_t *)(iter->context))[iter->index]; 158 } else { 159 return U_SENTINEL; 160 } 161 } 162 163 static UChar32 U_CALLCONV 164 stringIteratorNext(UCharIterator *iter) { 165 if(iter->index<iter->limit) { 166 return ((const char16_t *)(iter->context))[iter->index++]; 167 } else { 168 return U_SENTINEL; 169 } 170 } 171 172 static UChar32 U_CALLCONV 173 stringIteratorPrevious(UCharIterator *iter) { 174 if(iter->index>iter->start) { 175 return ((const char16_t *)(iter->context))[--iter->index]; 176 } else { 177 return U_SENTINEL; 178 } 179 } 180 181 static uint32_t U_CALLCONV 182 stringIteratorGetState(const UCharIterator *iter) { 183 return (uint32_t)iter->index; 184 } 185 186 static void U_CALLCONV 187 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 188 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 189 /* do nothing */ 190 } else if(iter==nullptr) { 191 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 192 } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) { 193 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 194 } else { 195 iter->index=(int32_t)state; 196 } 197 } 198 199 static const UCharIterator stringIterator={ 200 nullptr, 0, 0, 0, 0, 0, 201 stringIteratorGetIndex, 202 stringIteratorMove, 203 stringIteratorHasNext, 204 stringIteratorHasPrevious, 205 stringIteratorCurrent, 206 stringIteratorNext, 207 stringIteratorPrevious, 208 nullptr, 209 stringIteratorGetState, 210 stringIteratorSetState 211 }; 212 213 U_CAPI void U_EXPORT2 214 uiter_setString(UCharIterator *iter, const char16_t *s, int32_t length) { 215 if (iter != nullptr) { 216 if (s != nullptr && length >= -1) { 217 *iter=stringIterator; 218 iter->context=s; 219 if(length>=0) { 220 iter->length=length; 221 } else { 222 iter->length=u_strlen(s); 223 } 224 iter->limit=iter->length; 225 } else { 226 *iter=noopIterator; 227 } 228 } 229 } 230 231 /* UCharIterator implementation for UTF-16BE strings ------------------------ */ 232 233 /* 234 * This is an implementation of a code unit (char16_t) iterator 235 * for UTF-16BE strings, i.e., strings in byte-vectors where 236 * each char16_t is stored as a big-endian pair of bytes. 237 * 238 * The UCharIterator.context field holds a pointer to the string. 239 * Everything works just like with a normal char16_t iterator (uiter_setString), 240 * except that UChars are assembled from byte pairs. 241 */ 242 243 /* internal helper function */ 244 static inline UChar32 245 utf16BEIteratorGet(UCharIterator *iter, int32_t index) { 246 const uint8_t *p=(const uint8_t *)iter->context; 247 return ((char16_t)p[2*index]<<8)|(char16_t)p[2*index+1]; 248 } 249 250 static UChar32 U_CALLCONV 251 utf16BEIteratorCurrent(UCharIterator *iter) { 252 int32_t index; 253 254 if((index=iter->index)<iter->limit) { 255 return utf16BEIteratorGet(iter, index); 256 } else { 257 return U_SENTINEL; 258 } 259 } 260 261 static UChar32 U_CALLCONV 262 utf16BEIteratorNext(UCharIterator *iter) { 263 int32_t index; 264 265 if((index=iter->index)<iter->limit) { 266 iter->index=index+1; 267 return utf16BEIteratorGet(iter, index); 268 } else { 269 return U_SENTINEL; 270 } 271 } 272 273 static UChar32 U_CALLCONV 274 utf16BEIteratorPrevious(UCharIterator *iter) { 275 int32_t index; 276 277 if((index=iter->index)>iter->start) { 278 iter->index=--index; 279 return utf16BEIteratorGet(iter, index); 280 } else { 281 return U_SENTINEL; 282 } 283 } 284 285 static const UCharIterator utf16BEIterator={ 286 nullptr, 0, 0, 0, 0, 0, 287 stringIteratorGetIndex, 288 stringIteratorMove, 289 stringIteratorHasNext, 290 stringIteratorHasPrevious, 291 utf16BEIteratorCurrent, 292 utf16BEIteratorNext, 293 utf16BEIteratorPrevious, 294 nullptr, 295 stringIteratorGetState, 296 stringIteratorSetState 297 }; 298 299 /* 300 * Count the number of UChars in a UTF-16BE string before a terminating char16_t NUL, 301 * i.e., before a pair of 0 bytes where the first 0 byte is at an even 302 * offset from s. 303 */ 304 static int32_t 305 utf16BE_strlen(const char *s) { 306 if(IS_POINTER_EVEN(s)) { 307 /* 308 * even-aligned, call u_strlen(s) 309 * we are probably on a little-endian machine, but searching for char16_t NUL 310 * does not care about endianness 311 */ 312 return u_strlen((const char16_t *)s); 313 } else { 314 /* odd-aligned, search for pair of 0 bytes */ 315 const char *p=s; 316 317 while(!(*p==0 && p[1]==0)) { 318 p+=2; 319 } 320 return (int32_t)((p-s)/2); 321 } 322 } 323 324 U_CAPI void U_EXPORT2 325 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) { 326 if(iter!=nullptr) { 327 /* allow only even-length strings (the input length counts bytes) */ 328 if(s!=nullptr && (length==-1 || (length>=0 && IS_EVEN(length)))) { 329 /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */ 330 length>>=1; 331 332 if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) { 333 /* big-endian machine and 2-aligned UTF-16BE string: use normal char16_t iterator */ 334 uiter_setString(iter, (const char16_t *)s, length); 335 return; 336 } 337 338 *iter=utf16BEIterator; 339 iter->context=s; 340 if(length>=0) { 341 iter->length=length; 342 } else { 343 iter->length=utf16BE_strlen(s); 344 } 345 iter->limit=iter->length; 346 } else { 347 *iter=noopIterator; 348 } 349 } 350 } 351 352 /* UCharIterator wrapper around CharacterIterator --------------------------- */ 353 354 /* 355 * This is wrapper code around a C++ CharacterIterator to 356 * look like a C UCharIterator. 357 * 358 * The UCharIterator.context field holds a pointer to the CharacterIterator. 359 */ 360 361 static int32_t U_CALLCONV 362 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED { 363 switch(origin) { 364 case UITER_ZERO: 365 return 0; 366 case UITER_START: 367 return ((CharacterIterator *)(iter->context))->startIndex(); 368 case UITER_CURRENT: 369 return ((CharacterIterator *)(iter->context))->getIndex(); 370 case UITER_LIMIT: 371 return ((CharacterIterator *)(iter->context))->endIndex(); 372 case UITER_LENGTH: 373 return ((CharacterIterator *)(iter->context))->getLength(); 374 default: 375 /* not a valid origin */ 376 /* Should never get here! */ 377 return -1; 378 } 379 } 380 381 static int32_t U_CALLCONV 382 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED { 383 switch(origin) { 384 case UITER_ZERO: 385 ((CharacterIterator *)(iter->context))->setIndex(delta); 386 return ((CharacterIterator *)(iter->context))->getIndex(); 387 case UITER_START: 388 case UITER_CURRENT: 389 case UITER_LIMIT: 390 return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin); 391 case UITER_LENGTH: 392 ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta); 393 return ((CharacterIterator *)(iter->context))->getIndex(); 394 default: 395 /* not a valid origin */ 396 /* Should never get here! */ 397 return -1; 398 } 399 } 400 401 static UBool U_CALLCONV 402 characterIteratorHasNext(UCharIterator *iter) { 403 return ((CharacterIterator *)(iter->context))->hasNext(); 404 } 405 406 static UBool U_CALLCONV 407 characterIteratorHasPrevious(UCharIterator *iter) { 408 return ((CharacterIterator *)(iter->context))->hasPrevious(); 409 } 410 411 static UChar32 U_CALLCONV 412 characterIteratorCurrent(UCharIterator *iter) { 413 UChar32 c; 414 415 c=((CharacterIterator *)(iter->context))->current(); 416 if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) { 417 return c; 418 } else { 419 return U_SENTINEL; 420 } 421 } 422 423 static UChar32 U_CALLCONV 424 characterIteratorNext(UCharIterator *iter) { 425 if(((CharacterIterator *)(iter->context))->hasNext()) { 426 return ((CharacterIterator *)(iter->context))->nextPostInc(); 427 } else { 428 return U_SENTINEL; 429 } 430 } 431 432 static UChar32 U_CALLCONV 433 characterIteratorPrevious(UCharIterator *iter) { 434 if(((CharacterIterator *)(iter->context))->hasPrevious()) { 435 return ((CharacterIterator *)(iter->context))->previous(); 436 } else { 437 return U_SENTINEL; 438 } 439 } 440 441 static uint32_t U_CALLCONV 442 characterIteratorGetState(const UCharIterator *iter) { 443 return ((CharacterIterator *)(iter->context))->getIndex(); 444 } 445 446 static void U_CALLCONV 447 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 448 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 449 /* do nothing */ 450 } else if(iter==nullptr || iter->context==nullptr) { 451 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 452 } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) { 453 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 454 } else { 455 ((CharacterIterator *)(iter->context))->setIndex((int32_t)state); 456 } 457 } 458 459 static const UCharIterator characterIteratorWrapper={ 460 nullptr, 0, 0, 0, 0, 0, 461 characterIteratorGetIndex, 462 characterIteratorMove, 463 characterIteratorHasNext, 464 characterIteratorHasPrevious, 465 characterIteratorCurrent, 466 characterIteratorNext, 467 characterIteratorPrevious, 468 nullptr, 469 characterIteratorGetState, 470 characterIteratorSetState 471 }; 472 473 U_CAPI void U_EXPORT2 474 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) { 475 if (iter != nullptr) { 476 if (charIter != nullptr) { 477 *iter=characterIteratorWrapper; 478 iter->context=charIter; 479 } else { 480 *iter=noopIterator; 481 } 482 } 483 } 484 485 /* UCharIterator wrapper around Replaceable --------------------------------- */ 486 487 /* 488 * This is an implementation of a code unit (char16_t) iterator 489 * based on a Replaceable object. 490 * 491 * The UCharIterator.context field holds a pointer to the Replaceable. 492 * UCharIterator.length and UCharIterator.index hold Replaceable.length() 493 * and the iteration index. 494 */ 495 496 static UChar32 U_CALLCONV 497 replaceableIteratorCurrent(UCharIterator *iter) { 498 if(iter->index<iter->limit) { 499 return ((Replaceable *)(iter->context))->charAt(iter->index); 500 } else { 501 return U_SENTINEL; 502 } 503 } 504 505 static UChar32 U_CALLCONV 506 replaceableIteratorNext(UCharIterator *iter) { 507 if(iter->index<iter->limit) { 508 return ((Replaceable *)(iter->context))->charAt(iter->index++); 509 } else { 510 return U_SENTINEL; 511 } 512 } 513 514 static UChar32 U_CALLCONV 515 replaceableIteratorPrevious(UCharIterator *iter) { 516 if(iter->index>iter->start) { 517 return ((Replaceable *)(iter->context))->charAt(--iter->index); 518 } else { 519 return U_SENTINEL; 520 } 521 } 522 523 static const UCharIterator replaceableIterator={ 524 nullptr, 0, 0, 0, 0, 0, 525 stringIteratorGetIndex, 526 stringIteratorMove, 527 stringIteratorHasNext, 528 stringIteratorHasPrevious, 529 replaceableIteratorCurrent, 530 replaceableIteratorNext, 531 replaceableIteratorPrevious, 532 nullptr, 533 stringIteratorGetState, 534 stringIteratorSetState 535 }; 536 537 U_CAPI void U_EXPORT2 538 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) { 539 if (iter != nullptr) { 540 if (rep != nullptr) { 541 *iter=replaceableIterator; 542 iter->context=rep; 543 iter->limit=iter->length=rep->length(); 544 } else { 545 *iter=noopIterator; 546 } 547 } 548 } 549 550 /* UCharIterator implementation for UTF-8 strings --------------------------- */ 551 552 /* 553 * Possible, probably necessary only for an implementation for arbitrary 554 * converters: 555 * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text. 556 * This would require to turn reservedFn into a close function and 557 * to introduce a uiter_close(iter). 558 */ 559 560 #define UITER_CNV_CAPACITY 16 561 562 /* 563 * Minimal implementation: 564 * Maintain a single-char16_t buffer for an additional surrogate. 565 * The caller must not modify start and limit because they are used internally. 566 * 567 * Use UCharIterator fields as follows: 568 * context pointer to UTF-8 string 569 * length UTF-16 length of the string; -1 until lazy evaluation 570 * start current UTF-8 index 571 * index current UTF-16 index; may be -1="unknown" after setState() 572 * limit UTF-8 length of the string 573 * reservedField supplementary code point 574 * 575 * Since UCharIterator delivers 16-bit code units, the iteration can be 576 * currently in the middle of the byte sequence for a supplementary code point. 577 * In this case, reservedField will contain that code point and start will 578 * point to after the corresponding byte sequence. The UTF-16 index will be 579 * one less than what it would otherwise be corresponding to the UTF-8 index. 580 * Otherwise, reservedField will be 0. 581 */ 582 583 /* 584 * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: 585 * Add implementations that do not call strlen() for iteration but check for NUL. 586 */ 587 588 static int32_t U_CALLCONV 589 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED { 590 switch(origin) { 591 case UITER_ZERO: 592 case UITER_START: 593 return 0; 594 case UITER_CURRENT: 595 if(iter->index<0) { 596 /* the current UTF-16 index is unknown after setState(), count from the beginning */ 597 const uint8_t *s; 598 UChar32 c; 599 int32_t i, limit, index; 600 601 s=(const uint8_t *)iter->context; 602 i=index=0; 603 limit=iter->start; /* count up to the UTF-8 index */ 604 while(i<limit) { 605 U8_NEXT_OR_FFFD(s, i, limit, c); 606 index+=U16_LENGTH(c); 607 } 608 609 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 610 if(i==iter->limit) { 611 iter->length=index; /* in case it was <0 or wrong */ 612 } 613 if(iter->reservedField!=0) { 614 --index; /* we are in the middle of a supplementary code point */ 615 } 616 iter->index=index; 617 } 618 return iter->index; 619 case UITER_LIMIT: 620 case UITER_LENGTH: 621 if(iter->length<0) { 622 const uint8_t *s; 623 UChar32 c; 624 int32_t i, limit, length; 625 626 s=(const uint8_t *)iter->context; 627 if(iter->index<0) { 628 /* 629 * the current UTF-16 index is unknown after setState(), 630 * we must first count from the beginning to here 631 */ 632 i=length=0; 633 limit=iter->start; 634 635 /* count from the beginning to the current index */ 636 while(i<limit) { 637 U8_NEXT_OR_FFFD(s, i, limit, c); 638 length+=U16_LENGTH(c); 639 } 640 641 /* assume i==limit==iter->start, set the UTF-16 index */ 642 iter->start=i; /* just in case setState() did not get us to a code point boundary */ 643 iter->index= iter->reservedField!=0 ? length-1 : length; 644 } else { 645 i=iter->start; 646 length=iter->index; 647 if(iter->reservedField!=0) { 648 ++length; 649 } 650 } 651 652 /* count from the current index to the end */ 653 limit=iter->limit; 654 while(i<limit) { 655 U8_NEXT_OR_FFFD(s, i, limit, c); 656 length+=U16_LENGTH(c); 657 } 658 iter->length=length; 659 } 660 return iter->length; 661 default: 662 /* not a valid origin */ 663 /* Should never get here! */ 664 return -1; 665 } 666 } 667 668 static int32_t U_CALLCONV 669 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED { 670 const uint8_t *s; 671 UChar32 c; 672 int32_t pos; /* requested UTF-16 index */ 673 int32_t i; /* UTF-8 index */ 674 UBool havePos; 675 676 /* calculate the requested UTF-16 index */ 677 switch(origin) { 678 case UITER_ZERO: 679 case UITER_START: 680 pos=delta; 681 havePos=true; 682 /* iter->index<0 (unknown) is possible */ 683 break; 684 case UITER_CURRENT: 685 if(iter->index>=0) { 686 pos=iter->index+delta; 687 havePos=true; 688 } else { 689 /* the current UTF-16 index is unknown after setState(), use only delta */ 690 pos=0; 691 havePos=false; 692 } 693 break; 694 case UITER_LIMIT: 695 case UITER_LENGTH: 696 if(iter->length>=0) { 697 pos=iter->length+delta; 698 havePos=true; 699 } else { 700 /* pin to the end, avoid counting the length */ 701 iter->index=-1; 702 iter->start=iter->limit; 703 iter->reservedField=0; 704 if(delta>=0) { 705 return UITER_UNKNOWN_INDEX; 706 } else { 707 /* the current UTF-16 index is unknown, use only delta */ 708 pos=0; 709 havePos=false; 710 } 711 } 712 break; 713 default: 714 return -1; /* Error */ 715 } 716 717 if(havePos) { 718 /* shortcuts: pinning to the edges of the string */ 719 if(pos<=0) { 720 iter->index=iter->start=iter->reservedField=0; 721 return 0; 722 } else if(iter->length>=0 && pos>=iter->length) { 723 iter->index=iter->length; 724 iter->start=iter->limit; 725 iter->reservedField=0; 726 return iter->index; 727 } 728 729 /* minimize the number of U8_NEXT/PREV operations */ 730 if(iter->index<0 || pos<iter->index/2) { 731 /* go forward from the start instead of backward from the current index */ 732 iter->index=iter->start=iter->reservedField=0; 733 } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { 734 /* 735 * if we have the UTF-16 index and length and the new position is 736 * closer to the end than the current index, 737 * then go backward from the end instead of forward from the current index 738 */ 739 iter->index=iter->length; 740 iter->start=iter->limit; 741 iter->reservedField=0; 742 } 743 744 delta=pos-iter->index; 745 if(delta==0) { 746 return iter->index; /* nothing to do */ 747 } 748 } else { 749 /* move relative to unknown UTF-16 index */ 750 if(delta==0) { 751 return UITER_UNKNOWN_INDEX; /* nothing to do */ 752 } else if(-delta>=iter->start) { 753 /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */ 754 iter->index=iter->start=iter->reservedField=0; 755 return 0; 756 } else if(delta>=(iter->limit-iter->start)) { 757 /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */ 758 iter->index=iter->length; /* may or may not be <0 (unknown) */ 759 iter->start=iter->limit; 760 iter->reservedField=0; 761 return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX; 762 } 763 } 764 765 /* delta!=0 */ 766 767 /* move towards the requested position, pin to the edges of the string */ 768 s=(const uint8_t *)iter->context; 769 pos=iter->index; /* could be <0 (unknown) */ 770 i=iter->start; 771 if(delta>0) { 772 /* go forward */ 773 int32_t limit=iter->limit; 774 if(iter->reservedField!=0) { 775 iter->reservedField=0; 776 ++pos; 777 --delta; 778 } 779 while(delta>0 && i<limit) { 780 U8_NEXT_OR_FFFD(s, i, limit, c); 781 if(c<=0xffff) { 782 ++pos; 783 --delta; 784 } else if(delta>=2) { 785 pos+=2; 786 delta-=2; 787 } else /* delta==1 */ { 788 /* stop in the middle of a supplementary code point */ 789 iter->reservedField=c; 790 ++pos; 791 break; /* delta=0; */ 792 } 793 } 794 if(i==limit) { 795 if(iter->length<0 && iter->index>=0) { 796 iter->length= iter->reservedField==0 ? pos : pos+1; 797 } else if(iter->index<0 && iter->length>=0) { 798 iter->index= iter->reservedField==0 ? iter->length : iter->length-1; 799 } 800 } 801 } else /* delta<0 */ { 802 /* go backward */ 803 if(iter->reservedField!=0) { 804 iter->reservedField=0; 805 i-=4; /* we stayed behind the supplementary code point; go before it now */ 806 --pos; 807 ++delta; 808 } 809 while(delta<0 && i>0) { 810 U8_PREV_OR_FFFD(s, 0, i, c); 811 if(c<=0xffff) { 812 --pos; 813 ++delta; 814 } else if(delta<=-2) { 815 pos-=2; 816 delta+=2; 817 } else /* delta==-1 */ { 818 /* stop in the middle of a supplementary code point */ 819 i+=4; /* back to behind this supplementary code point for consistent state */ 820 iter->reservedField=c; 821 --pos; 822 break; /* delta=0; */ 823 } 824 } 825 } 826 827 iter->start=i; 828 if(iter->index>=0) { 829 return iter->index=pos; 830 } else { 831 /* we started with index<0 (unknown) so pos is bogus */ 832 if(i<=1) { 833 return iter->index=i; /* reached the beginning */ 834 } else { 835 /* we still don't know the UTF-16 index */ 836 return UITER_UNKNOWN_INDEX; 837 } 838 } 839 } 840 841 static UBool U_CALLCONV 842 utf8IteratorHasNext(UCharIterator *iter) { 843 return iter->start<iter->limit || iter->reservedField!=0; 844 } 845 846 static UBool U_CALLCONV 847 utf8IteratorHasPrevious(UCharIterator *iter) { 848 return iter->start>0; 849 } 850 851 static UChar32 U_CALLCONV 852 utf8IteratorCurrent(UCharIterator *iter) { 853 if(iter->reservedField!=0) { 854 return U16_TRAIL(iter->reservedField); 855 } else if(iter->start<iter->limit) { 856 const uint8_t *s=(const uint8_t *)iter->context; 857 UChar32 c; 858 int32_t i=iter->start; 859 860 U8_NEXT_OR_FFFD(s, i, iter->limit, c); 861 if(c<=0xffff) { 862 return c; 863 } else { 864 return U16_LEAD(c); 865 } 866 } else { 867 return U_SENTINEL; 868 } 869 } 870 871 static UChar32 U_CALLCONV 872 utf8IteratorNext(UCharIterator *iter) { 873 int32_t index; 874 875 if(iter->reservedField!=0) { 876 char16_t trail=U16_TRAIL(iter->reservedField); 877 iter->reservedField=0; 878 if((index=iter->index)>=0) { 879 iter->index=index+1; 880 } 881 return trail; 882 } else if(iter->start<iter->limit) { 883 const uint8_t *s=(const uint8_t *)iter->context; 884 UChar32 c; 885 886 U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c); 887 if((index=iter->index)>=0) { 888 iter->index=++index; 889 if(iter->length<0 && iter->start==iter->limit) { 890 iter->length= c<=0xffff ? index : index+1; 891 } 892 } else if(iter->start==iter->limit && iter->length>=0) { 893 iter->index= c<=0xffff ? iter->length : iter->length-1; 894 } 895 if(c<=0xffff) { 896 return c; 897 } else { 898 iter->reservedField=c; 899 return U16_LEAD(c); 900 } 901 } else { 902 return U_SENTINEL; 903 } 904 } 905 906 static UChar32 U_CALLCONV 907 utf8IteratorPrevious(UCharIterator *iter) { 908 int32_t index; 909 910 if(iter->reservedField!=0) { 911 char16_t lead=U16_LEAD(iter->reservedField); 912 iter->reservedField=0; 913 iter->start-=4; /* we stayed behind the supplementary code point; go before it now */ 914 if((index=iter->index)>0) { 915 iter->index=index-1; 916 } 917 return lead; 918 } else if(iter->start>0) { 919 const uint8_t *s=(const uint8_t *)iter->context; 920 UChar32 c; 921 922 U8_PREV_OR_FFFD(s, 0, iter->start, c); 923 if((index=iter->index)>0) { 924 iter->index=index-1; 925 } else if(iter->start<=1) { 926 iter->index= c<=0xffff ? iter->start : iter->start+1; 927 } 928 if(c<=0xffff) { 929 return c; 930 } else { 931 iter->start+=4; /* back to behind this supplementary code point for consistent state */ 932 iter->reservedField=c; 933 return U16_TRAIL(c); 934 } 935 } else { 936 return U_SENTINEL; 937 } 938 } 939 940 static uint32_t U_CALLCONV 941 utf8IteratorGetState(const UCharIterator *iter) { 942 uint32_t state=(uint32_t)(iter->start<<1); 943 if(iter->reservedField!=0) { 944 state|=1; 945 } 946 return state; 947 } 948 949 static void U_CALLCONV 950 utf8IteratorSetState(UCharIterator *iter, 951 uint32_t state, 952 UErrorCode *pErrorCode) 953 { 954 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 955 /* do nothing */ 956 } else if(iter==nullptr) { 957 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 958 } else if(state==utf8IteratorGetState(iter)) { 959 /* setting to the current state: no-op */ 960 } else { 961 int32_t index=(int32_t)(state>>1); /* UTF-8 index */ 962 state&=1; /* 1 if in surrogate pair, must be index>=4 */ 963 964 if((state==0 ? index<0 : index<4) || iter->limit<index) { 965 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 966 } else { 967 iter->start=index; /* restore UTF-8 byte index */ 968 if(index<=1) { 969 iter->index=index; 970 } else { 971 iter->index=-1; /* unknown UTF-16 index */ 972 } 973 if(state==0) { 974 iter->reservedField=0; 975 } else { 976 /* verified index>=4 above */ 977 UChar32 c; 978 U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c); 979 if(c<=0xffff) { 980 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 981 } else { 982 iter->reservedField=c; 983 } 984 } 985 } 986 } 987 } 988 989 static const UCharIterator utf8Iterator={ 990 nullptr, 0, 0, 0, 0, 0, 991 utf8IteratorGetIndex, 992 utf8IteratorMove, 993 utf8IteratorHasNext, 994 utf8IteratorHasPrevious, 995 utf8IteratorCurrent, 996 utf8IteratorNext, 997 utf8IteratorPrevious, 998 nullptr, 999 utf8IteratorGetState, 1000 utf8IteratorSetState 1001 }; 1002 1003 U_CAPI void U_EXPORT2 1004 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) { 1005 if (iter != nullptr) { 1006 if (s != nullptr && length >= -1) { 1007 *iter=utf8Iterator; 1008 iter->context=s; 1009 if(length>=0) { 1010 iter->limit=length; 1011 } else { 1012 iter->limit=(int32_t)uprv_strlen(s); 1013 } 1014 iter->length= iter->limit<=1 ? iter->limit : -1; 1015 } else { 1016 *iter=noopIterator; 1017 } 1018 } 1019 } 1020 1021 /* Helper functions --------------------------------------------------------- */ 1022 1023 U_CAPI UChar32 U_EXPORT2 1024 uiter_current32(UCharIterator *iter) { 1025 UChar32 c, c2; 1026 1027 c=iter->current(iter); 1028 if(U16_IS_SURROGATE(c)) { 1029 if(U16_IS_SURROGATE_LEAD(c)) { 1030 /* 1031 * go to the next code unit 1032 * we know that we are not at the limit because c!=U_SENTINEL 1033 */ 1034 iter->move(iter, 1, UITER_CURRENT); 1035 if(U16_IS_TRAIL(c2=iter->current(iter))) { 1036 c=U16_GET_SUPPLEMENTARY(c, c2); 1037 } 1038 1039 /* undo index movement */ 1040 iter->move(iter, -1, UITER_CURRENT); 1041 } else { 1042 if(U16_IS_LEAD(c2=iter->previous(iter))) { 1043 c=U16_GET_SUPPLEMENTARY(c2, c); 1044 } 1045 if(c2>=0) { 1046 /* undo index movement */ 1047 iter->move(iter, 1, UITER_CURRENT); 1048 } 1049 } 1050 } 1051 return c; 1052 } 1053 1054 U_CAPI UChar32 U_EXPORT2 1055 uiter_next32(UCharIterator *iter) { 1056 UChar32 c, c2; 1057 1058 c=iter->next(iter); 1059 if(U16_IS_LEAD(c)) { 1060 if(U16_IS_TRAIL(c2=iter->next(iter))) { 1061 c=U16_GET_SUPPLEMENTARY(c, c2); 1062 } else if(c2>=0) { 1063 /* unmatched first surrogate, undo index movement */ 1064 iter->move(iter, -1, UITER_CURRENT); 1065 } 1066 } 1067 return c; 1068 } 1069 1070 U_CAPI UChar32 U_EXPORT2 1071 uiter_previous32(UCharIterator *iter) { 1072 UChar32 c, c2; 1073 1074 c=iter->previous(iter); 1075 if(U16_IS_TRAIL(c)) { 1076 if(U16_IS_LEAD(c2=iter->previous(iter))) { 1077 c=U16_GET_SUPPLEMENTARY(c2, c); 1078 } else if(c2>=0) { 1079 /* unmatched second surrogate, undo index movement */ 1080 iter->move(iter, 1, UITER_CURRENT); 1081 } 1082 } 1083 return c; 1084 } 1085 1086 U_CAPI uint32_t U_EXPORT2 1087 uiter_getState(const UCharIterator *iter) { 1088 if(iter==nullptr || iter->getState==nullptr) { 1089 return UITER_NO_STATE; 1090 } else { 1091 return iter->getState(iter); 1092 } 1093 } 1094 1095 U_CAPI void U_EXPORT2 1096 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 1097 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { 1098 /* do nothing */ 1099 } else if(iter==nullptr) { 1100 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1101 } else if(iter->setState==nullptr) { 1102 *pErrorCode=U_UNSUPPORTED_ERROR; 1103 } else { 1104 iter->setState(iter, state, pErrorCode); 1105 } 1106 } 1107 1108 U_CDECL_END