utfiterator.h (97060B)
// © 2024 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html

// utfiterator.h
// created: 2024aug12 Markus W. Scherer

#ifndef __UTFITERATOR_H__
#define __UTFITERATOR_H__

#include "unicode/utypes.h"

#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)

#include <iterator>
#if defined(__cpp_lib_ranges)
#include <ranges>
#endif
#include <string>
#include <string_view>
#include <type_traits>
#include "unicode/utf16.h"
#include "unicode/utf8.h"
#include "unicode/uversion.h"

/**
 * \file
 * \brief C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed).
 *
 * Sample code:
 * \code
 * #include <string_view>
 * #include <iostream>
 * #include "unicode/utypes.h"
 * #include "unicode/utfiterator.h"
 *
 * using icu::header::utfIterator;
 * using icu::header::utfStringCodePoints;
 * using icu::header::unsafeUTFIterator;
 * using icu::header::unsafeUTFStringCodePoints;
 *
 * int32_t rangeLoop16(std::u16string_view s) {
 *     // We are just adding up the code points for minimal-code demonstration purposes.
 *     int32_t sum = 0;
 *     for (auto units : utfStringCodePoints<UChar32, UTF_BEHAVIOR_NEGATIVE>(s)) {
 *         sum += units.codePoint();  // < 0 if ill-formed
 *     }
 *     return sum;
 * }
 *
 * int32_t loopIterPlusPlus16(std::u16string_view s) {
 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
 *     int32_t sum = 0;
 *     for (auto iter = range.begin(), limit = range.end(); iter != limit;) {
 *         sum += (*iter++).codePoint();  // U+FFFD if ill-formed
 *     }
 *     return sum;
 * }
 *
 * int32_t backwardLoop16(std::u16string_view s) {
 *     auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
 *     int32_t sum = 0;
 *     for (auto start = range.begin(), iter = range.end(); start != iter;) {
 *         sum += (*--iter).codePoint();  // surrogate code point if unpaired / ill-formed
 *     }
 *     return sum;
 * }
 *
 * int32_t reverseLoop8(std::string_view s) {
 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
 *     int32_t sum = 0;
 *     for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) {
 *         sum += iter->codePoint();  // U+FFFD if ill-formed
 *     }
 *     return sum;
 * }
 *
 * int32_t countCodePoints16(std::u16string_view s) {
 *     auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
 *     return std::distance(range.begin(), range.end());
 * }
 *
 * int32_t unsafeRangeLoop16(std::u16string_view s) {
 *     int32_t sum = 0;
 *     for (auto units : unsafeUTFStringCodePoints<UChar32>(s)) {
 *         sum += units.codePoint();
 *     }
 *     return sum;
 * }
 *
 * int32_t unsafeReverseLoop8(std::string_view s) {
 *     auto range = unsafeUTFStringCodePoints<UChar32>(s);
 *     int32_t sum = 0;
 *     for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) {
 *         sum += iter->codePoint();
 *     }
 *     return sum;
 * }
 *
 * char32_t firstCodePointOrFFFD16(std::u16string_view s) {
 *     if (s.empty()) { return 0xfffd; }
 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
 *     return range.begin()->codePoint();
 * }
 *
 * std::string_view firstSequence8(std::string_view s) {
 *     if (s.empty()) { return {}; }
 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
 *     auto units = *(range.begin());
 *     if (units.wellFormed()) {
 *         return units.stringView();
 *     } else {
 *         return {};
 *     }
 * }
 *
 * template<typename InputStream>  // some istream or streambuf
 * std::u32string cpFromInput(InputStream &in) {
 *     // This is a single-pass input_iterator.
 *     std::istreambuf_iterator bufIter(in);
 *     std::istreambuf_iterator<typename InputStream::char_type> bufLimit;
 *     auto iter = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufIter);
 *     auto limit = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufLimit);
 *     std::u32string s32;
 *     for (; iter != limit; ++iter) {
 *         s32.push_back(iter->codePoint());
 *     }
 *     return s32;
 * }
 *
 * std::u32string cpFromStdin() { return cpFromInput(std::cin); }
 * std::u32string cpFromWideStdin() { return cpFromInput(std::wcin); }
 * \endcode
 */

#ifndef U_HIDE_DRAFT_API

/**
 * Some defined behaviors for handling ill-formed Unicode strings.
 * This is a template parameter for UTFIterator and related classes.
 *
 * When a validating UTFIterator encounters an ill-formed code unit sequence,
 * then CodeUnits.codePoint() is a value according to this parameter.
 *
 * @draft ICU 78
 * @see CodeUnits
 * @see UTFIterator
 * @see UTFStringCodePoints
 */
typedef enum UTFIllFormedBehavior {
    /**
     * Returns a negative value (-1=U_SENTINEL) instead of a code point.
     * If the CP32 template parameter for the relevant classes is an unsigned type,
     * then the negative value becomes 0xffffffff=UINT32_MAX.
     *
     * @draft ICU 78
     */
    UTF_BEHAVIOR_NEGATIVE,
    /** Returns U+FFFD Replacement Character. @draft ICU 78 */
    UTF_BEHAVIOR_FFFD,
    /**
     * UTF-8: Not allowed;
     * UTF-16: returns the unpaired surrogate;
     * UTF-32: returns the surrogate code point, or U+FFFD if out of range.
     *
     * @draft ICU 78
     */
    UTF_BEHAVIOR_SURROGATE
} UTFIllFormedBehavior;

namespace U_HEADER_ONLY_NAMESPACE {

// Internal helpers: one set of names for the iterator/range traits used in this header,
// defined in terms of the C++20 library when available and of std::iterator_traits otherwise.
namespace prv {
#if U_CPLUSPLUS_VERSION >= 20

// C++20: thin aliases for the standard library's iterator traits and concepts.

/** @internal */
template<typename Iter>
using iter_value_t = typename std::iter_value_t<Iter>;

/** @internal */
template<typename Iter>
using iter_difference_t = std::iter_difference_t<Iter>;

/** @internal */
template<typename Iter>
constexpr bool forward_iterator = std::forward_iterator<Iter>;

/** @internal */
template<typename Iter>
constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>;

/** @internal */
template<typename Range>
constexpr bool range = std::ranges::range<Range>;

#else

// Before C++20: equivalents based on std::iterator_traits and the iterator category tags.

/** @internal */
template<typename Iter>
using iter_value_t = typename std::iterator_traits<Iter>::value_type;

/** @internal */
template<typename Iter>
using iter_difference_t = typename std::iterator_traits<Iter>::difference_type;

/** @internal True if Iter's iterator_category is forward_iterator_tag or derived from it. */
template<typename Iter>
constexpr bool forward_iterator =
    std::is_base_of_v<
        std::forward_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;

/** @internal True if Iter's iterator_category is bidirectional_iterator_tag or derived from it. */
template<typename Iter>
constexpr bool bidirectional_iterator =
    std::is_base_of_v<
        std::bidirectional_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>;

/** @internal Primary template: not a range. */
template<typename Range, typename = void>
struct range_type : std::false_type {};

/**
 * @internal
 * Selected when both member functions begin() and end() can be called on Range.
 * Note that only member begin()/end() are detected here, not free functions.
 */
template<typename Range>
struct range_type<
    Range,
    std::void_t<decltype(std::declval<Range>().begin()),
                decltype(std::declval<Range>().end())>> : std::true_type {};
229 230 /** @internal */ 231 template<typename Range> 232 constexpr bool range = range_type<Range>::value; 233 234 #endif 235 236 /** @internal */ 237 template <typename T> struct is_basic_string_view : std::false_type {}; 238 239 /** @internal */ 240 template <typename... Args> 241 struct is_basic_string_view<std::basic_string_view<Args...>> : std::true_type {}; 242 243 /** @internal */ 244 template <typename T> constexpr bool is_basic_string_view_v = is_basic_string_view<T>::value; 245 246 /** @internal */ 247 template<typename CP32, bool skipSurrogates> 248 class CodePointsIterator { 249 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 250 public: 251 /** C++ iterator boilerplate @internal */ 252 using value_type = CP32; 253 /** C++ iterator boilerplate @internal */ 254 using reference = value_type; 255 /** C++ iterator boilerplate @internal */ 256 using pointer = CP32 *; 257 /** C++ iterator boilerplate @internal */ 258 using difference_type = int32_t; 259 /** C++ iterator boilerplate @internal */ 260 using iterator_category = std::forward_iterator_tag; 261 262 /** @internal */ 263 inline CodePointsIterator(CP32 c) : c_(c) {} 264 /** @internal */ 265 inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; } 266 /** @internal */ 267 inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); } 268 /** @internal */ 269 inline CP32 operator*() const { return c_; } 270 /** @internal */ 271 inline CodePointsIterator &operator++() { // pre-increment 272 ++c_; 273 if (skipSurrogates && c_ == 0xd800) { 274 c_ = 0xe000; 275 } 276 return *this; 277 } 278 /** @internal */ 279 inline CodePointsIterator operator++(int) { // post-increment 280 CodePointsIterator result(*this); 281 ++(*this); 282 return result; 283 } 284 285 private: 286 CP32 c_; 287 }; 288 289 } // namespace prv 290 291 /** 292 * A C++ "range" over all Unicode code points U+0000..U+10FFFF. 
293 * https://www.unicode.org/glossary/#code_point 294 * 295 * Intended for test and builder code. 296 * 297 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t 298 * @draft ICU 78 299 * @see U_IS_CODE_POINT 300 */ 301 template<typename CP32> 302 class AllCodePoints { 303 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 304 public: 305 /** Constructor. @draft ICU 78 */ 306 AllCodePoints() {} 307 /** 308 * @return an iterator over all Unicode code points. 309 * The iterator returns CP32 integers. 310 * @draft ICU 78 311 */ 312 auto begin() const { return prv::CodePointsIterator<CP32, false>(0); } 313 /** 314 * @return an exclusive-end iterator over all Unicode code points. 315 * @draft ICU 78 316 */ 317 auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); } 318 }; 319 320 /** 321 * A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF. 322 * That is, all code points except surrogates. 323 * Only scalar values can be represented in well-formed UTF-8/16/32. 324 * https://www.unicode.org/glossary/#unicode_scalar_value 325 * 326 * Intended for test and builder code. 327 * 328 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t 329 * @draft ICU 78 330 * @see U_IS_SCALAR_VALUE 331 */ 332 template<typename CP32> 333 class AllScalarValues { 334 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 335 public: 336 /** Constructor. @draft ICU 78 */ 337 AllScalarValues() {} 338 /** 339 * @return an iterator over all Unicode scalar values. 340 * The iterator returns CP32 integers. 341 * @draft ICU 78 342 */ 343 auto begin() const { return prv::CodePointsIterator<CP32, true>(0); } 344 /** 345 * @return an exclusive-end iterator over all Unicode scalar values. 
346 * @draft ICU 78 347 */ 348 auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); } 349 }; 350 351 /** 352 * Result of decoding a code unit sequence for one code point. 353 * Returned from non-validating Unicode string code point iterators. 354 * Base class for class CodeUnits which is returned from validating iterators. 355 * 356 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; 357 * should be signed if UTF_BEHAVIOR_NEGATIVE 358 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: 359 * UTF-8: char or char8_t or uint8_t; 360 * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; 361 * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t 362 * @see UnsafeUTFIterator 363 * @see UnsafeUTFStringCodePoints 364 * @draft ICU 78 365 */ 366 template<typename CP32, typename UnitIter, typename = void> 367 class UnsafeCodeUnits { 368 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 369 using Unit = typename prv::iter_value_t<UnitIter>; 370 public: 371 /** @internal */ 372 UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) : 373 c_(codePoint), len_(length), start_(start), limit_(limit) {} 374 375 /** Copy constructor. @draft ICU 78 */ 376 UnsafeCodeUnits(const UnsafeCodeUnits &other) = default; 377 /** Copy assignment operator. @draft ICU 78 */ 378 UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default; 379 380 /** 381 * @return the Unicode code point decoded from the code unit sequence. 382 * If the sequence is ill-formed and the iterator validates, 383 * then this is a replacement value according to the iterator‘s 384 * UTFIllFormedBehavior template parameter. 385 * @draft ICU 78 386 */ 387 CP32 codePoint() const { return c_; } 388 389 /** 390 * @return the start of the code unit sequence for one code point. 391 * Only enabled if UnitIter is a (multi-pass) forward_iterator or better. 
392 * @draft ICU 78 393 */ 394 UnitIter begin() const { return start_; } 395 396 /** 397 * @return the limit (exclusive end) of the code unit sequence for one code point. 398 * Only enabled if UnitIter is a (multi-pass) forward_iterator or better. 399 * @draft ICU 78 400 */ 401 UnitIter end() const { return limit_; } 402 403 /** 404 * @return the length of the code unit sequence for one code point. 405 * @draft ICU 78 406 */ 407 uint8_t length() const { return len_; } 408 409 #if U_CPLUSPLUS_VERSION >= 20 410 /** 411 * @return a string_view of the code unit sequence for one code point. 412 * Only works if UnitIter is a pointer or a contiguous_iterator. 413 * @draft ICU 78 414 */ 415 template<std::contiguous_iterator Iter = UnitIter> 416 std::basic_string_view<Unit> stringView() const { 417 return std::basic_string_view<Unit>(begin(), end()); 418 } 419 #else 420 /** 421 * @return a string_view of the code unit sequence for one code point. 422 * Only works if UnitIter is a pointer or a contiguous_iterator. 423 * @draft ICU 78 424 */ 425 template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type> 426 std::enable_if_t<std::is_pointer_v<Iter> || 427 std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> || 428 std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> || 429 std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> || 430 std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>, 431 std::basic_string_view<Unit>> 432 stringView() const { 433 return std::basic_string_view<Unit>(&*start_, len_); 434 } 435 #endif 436 437 private: 438 // Order of fields with padding and access frequency in mind. 439 CP32 c_; 440 uint8_t len_; 441 UnitIter start_; 442 UnitIter limit_; 443 }; 444 445 #ifndef U_IN_DOXYGEN 446 // Partial template specialization for single-pass input iterator. 447 // No UnitIter field, no getter for it, no stringView(). 
// Selected when UnitIter is not a (multi-pass) forward iterator.
// Stores only the code point and the sequence length.
template<typename CP32, typename UnitIter>
class UnsafeCodeUnits<
        CP32,
        UnitIter,
        std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
    UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {}

    UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
    UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;

    // Same meaning as in the primary template.
    CP32 codePoint() const { return c_; }

    // Same meaning as in the primary template.
    uint8_t length() const { return len_; }

private:
    // Order of fields with padding and access frequency in mind.
    CP32 c_;
    uint8_t len_;
};
#endif  // U_IN_DOXYGEN

/**
 * Result of validating and decoding a code unit sequence for one code point.
 * Returned from validating Unicode string code point iterators.
 * Adds function wellFormed() to base class UnsafeCodeUnits.
 *
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
 *              should be signed if UTF_BEHAVIOR_NEGATIVE
 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
 *     UTF-8: char or char8_t or uint8_t;
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
 * @see UTFIterator
 * @see UTFStringCodePoints
 * @draft ICU 78
 */
template<typename CP32, typename UnitIter, typename = void>
class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> {
public:
    /** @internal */
    CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) :
            UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {}

    /** Copy constructor. @draft ICU 78 */
    CodeUnits(const CodeUnits &other) = default;
    /** Copy assignment operator. @draft ICU 78 */
    CodeUnits &operator=(const CodeUnits &other) = default;

    /**
     * @return true if the decoded code unit sequence is well-formed.
     * @draft ICU 78
     */
    bool wellFormed() const { return ok_; }

private:
    bool ok_;
};

#ifndef U_IN_DOXYGEN
// Partial template specialization for single-pass input iterator.
// No UnitIter field, no getter for it, no stringView().
// Its constructor takes no start/limit iterators, matching the
// corresponding UnsafeCodeUnits specialization that it derives from.
template<typename CP32, typename UnitIter>
class CodeUnits<
        CP32,
        UnitIter,
        std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
            public UnsafeCodeUnits<CP32, UnitIter> {
public:
    CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) :
            UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {}

    CodeUnits(const CodeUnits &other) = default;
    CodeUnits &operator=(const CodeUnits &other) = default;

    bool wellFormed() const { return ok_; }

private:
    bool ok_;
};
#endif  // U_IN_DOXYGEN

// Validating implementations ---------------------------------------------- ***

#ifndef U_IN_DOXYGEN
// Declared here; the partial specializations are selected by the size of the code unit type.
template<typename CP32, UTFIllFormedBehavior behavior,
         typename UnitIter, typename LimitIter = UnitIter, typename = void>
class UTFImpl;

// Note: readAndInc() functions take both a p0 and a p iterator.
// They must have the same value.
// For a multi-pass UnitIter, the caller must copy its p into a local variable p0,
// and readAndInc() copies p0 and the incremented p into the CodeUnits.
// For a single-pass UnitIter, which may be neither default-constructible nor copyable,
// the caller can pass p into both references, and readAndInc() does not use p0
// and constructs CodeUnits without them.
// Moving the p0 variable into the call site avoids having to declare it inside readAndInc()
// which may not be possible for a single-pass iterator.
547 548 // UTF-8 549 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter> 550 class UTFImpl< 551 CP32, behavior, 552 UnitIter, LimitIter, 553 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> { 554 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 555 static_assert(behavior != UTF_BEHAVIOR_SURROGATE, 556 "For 8-bit strings, the SURROGATE option does not have an equivalent."); 557 public: 558 // Handle ill-formed UTF-8 559 U_FORCE_INLINE static CP32 sub() { 560 switch (behavior) { 561 case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL; 562 case UTF_BEHAVIOR_FFFD: return 0xfffd; 563 } 564 } 565 566 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) { 567 // Very similar to U8_FWD_1(). 568 uint8_t b = *p; 569 ++p; 570 if (U8_IS_LEAD(b) && p != limit) { 571 uint8_t t1 = *p; 572 if ((0xe0 <= b && b < 0xf0)) { 573 if (U8_IS_VALID_LEAD3_AND_T1(b, t1) && 574 ++p != limit && U8_IS_TRAIL(*p)) { 575 ++p; 576 } 577 } else if (b < 0xe0) { 578 if (U8_IS_TRAIL(t1)) { 579 ++p; 580 } 581 } else /* b >= 0xf0 */ { 582 if (U8_IS_VALID_LEAD4_AND_T1(b, t1) && 583 ++p != limit && U8_IS_TRAIL(*p) && 584 ++p != limit && U8_IS_TRAIL(*p)) { 585 ++p; 586 } 587 } 588 } 589 } 590 591 U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) { 592 // Very similar to U8_BACK_1(). 593 uint8_t c = *--p; 594 if (U8_IS_TRAIL(c) && p != start) { 595 UnitIter p1 = p; 596 uint8_t b1 = *--p1; 597 if (U8_IS_LEAD(b1)) { 598 if (b1 < 0xe0 || 599 (b1 < 0xf0 ? 600 U8_IS_VALID_LEAD3_AND_T1(b1, c) : 601 U8_IS_VALID_LEAD4_AND_T1(b1, c))) { 602 p = p1; 603 return; 604 } 605 } else if (U8_IS_TRAIL(b1) && p1 != start) { 606 uint8_t b2 = *--p1; 607 if (0xe0 <= b2 && b2 <= 0xf4) { 608 if (b2 < 0xf0 ? 
609 U8_IS_VALID_LEAD3_AND_T1(b2, b1) : 610 U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { 611 p = p1; 612 return; 613 } 614 } else if (U8_IS_TRAIL(b2) && p1 != start) { 615 uint8_t b3 = *--p1; 616 if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { 617 p = p1; 618 return; 619 } 620 } 621 } 622 } 623 } 624 625 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc( 626 UnitIter &p0, UnitIter &p, const LimitIter &limit) { 627 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; 628 // Very similar to U8_NEXT_OR_FFFD(). 629 CP32 c = uint8_t(*p); 630 ++p; 631 if (U8_IS_SINGLE(c)) { 632 if constexpr (isMultiPass) { 633 return {c, 1, true, p0, p}; 634 } else { 635 return {c, 1, true}; 636 } 637 } 638 uint8_t length = 1; 639 uint8_t t = 0; 640 if (p != limit && 641 // fetch/validate/assemble all but last trail byte 642 (c >= 0xe0 ? 643 (c < 0xf0 ? // U+0800..U+FFFF except surrogates 644 U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) && 645 (t &= 0x3f, 1) 646 : // U+10000..U+10FFFF 647 (c -= 0xf0) <= 4 && 648 U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) && 649 (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) && 650 (t = *p - 0x80) <= 0x3f) && 651 // valid second-to-last trail byte 652 (c = (c << 6) | t, ++length, ++p != limit) 653 : // U+0080..U+07FF 654 c >= 0xc2 && (c &= 0x1f, 1)) && 655 // last trail byte 656 (t = *p - 0x80) <= 0x3f) { 657 c = (c << 6) | t; 658 ++length; 659 ++p; 660 if constexpr (isMultiPass) { 661 return {c, length, true, p0, p}; 662 } else { 663 return {c, length, true}; 664 } 665 } 666 if constexpr (isMultiPass) { 667 return {sub(), length, false, p0, p}; 668 } else { 669 return {sub(), length, false}; 670 } 671 } 672 673 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) { 674 // Very similar to U8_PREV_OR_FFFD(). 
675 UnitIter p0 = p; 676 CP32 c = uint8_t(*--p); 677 if (U8_IS_SINGLE(c)) { 678 return {c, 1, true, p, p0}; 679 } 680 if (U8_IS_TRAIL(c) && p != start) { 681 UnitIter p1 = p; 682 uint8_t b1 = *--p1; 683 if (U8_IS_LEAD(b1)) { 684 if (b1 < 0xe0) { 685 p = p1; 686 c = ((b1 - 0xc0) << 6) | (c & 0x3f); 687 return {c, 2, true, p, p0}; 688 } else if (b1 < 0xf0 ? 689 U8_IS_VALID_LEAD3_AND_T1(b1, c) : 690 U8_IS_VALID_LEAD4_AND_T1(b1, c)) { 691 // Truncated 3- or 4-byte sequence. 692 p = p1; 693 return {sub(), 2, false, p, p0}; 694 } 695 } else if (U8_IS_TRAIL(b1) && p1 != start) { 696 // Extract the value bits from the last trail byte. 697 c &= 0x3f; 698 uint8_t b2 = *--p1; 699 if (0xe0 <= b2 && b2 <= 0xf4) { 700 if (b2 < 0xf0) { 701 b2 &= 0xf; 702 if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { 703 p = p1; 704 c = (b2 << 12) | ((b1 & 0x3f) << 6) | c; 705 return {c, 3, true, p, p0}; 706 } 707 } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { 708 // Truncated 4-byte sequence. 709 p = p1; 710 return {sub(), 3, false, p, p0}; 711 } 712 } else if (U8_IS_TRAIL(b2) && p1 != start) { 713 uint8_t b3 = *--p1; 714 if (0xf0 <= b3 && b3 <= 0xf4) { 715 b3 &= 7; 716 if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { 717 p = p1; 718 c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c; 719 return {c, 4, true, p, p0}; 720 } 721 } 722 } 723 } 724 } 725 return {sub(), 1, false, p, p0}; 726 } 727 }; 728 729 // UTF-16 730 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter> 731 class UTFImpl< 732 CP32, behavior, 733 UnitIter, LimitIter, 734 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> { 735 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 736 public: 737 // Handle ill-formed UTF-16: One unpaired surrogate. 
738 U_FORCE_INLINE static CP32 sub(CP32 surrogate) { 739 switch (behavior) { 740 case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL; 741 case UTF_BEHAVIOR_FFFD: return 0xfffd; 742 case UTF_BEHAVIOR_SURROGATE: return surrogate; 743 } 744 } 745 746 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) { 747 // Very similar to U16_FWD_1(). 748 auto c = *p; 749 ++p; 750 if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) { 751 ++p; 752 } 753 } 754 755 U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) { 756 // Very similar to U16_BACK_1(). 757 UnitIter p1; 758 if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) { 759 p = p1; 760 } 761 } 762 763 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc( 764 UnitIter &p0, UnitIter &p, const LimitIter &limit) { 765 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; 766 // Very similar to U16_NEXT_OR_FFFD(). 767 CP32 c = static_cast<CP32>(*p); 768 ++p; 769 if (!U16_IS_SURROGATE(c)) { 770 if constexpr (isMultiPass) { 771 return {c, 1, true, p0, p}; 772 } else { 773 return {c, 1, true}; 774 } 775 } else { 776 uint16_t c2; 777 if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) { 778 ++p; 779 c = U16_GET_SUPPLEMENTARY(c, c2); 780 if constexpr (isMultiPass) { 781 return {c, 2, true, p0, p}; 782 } else { 783 return {c, 2, true}; 784 } 785 } else { 786 if constexpr (isMultiPass) { 787 return {sub(c), 1, false, p0, p}; 788 } else { 789 return {sub(c), 1, false}; 790 } 791 } 792 } 793 } 794 795 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) { 796 // Very similar to U16_PREV_OR_FFFD(). 
797 UnitIter p0 = p; 798 CP32 c = static_cast<CP32>(*--p); 799 if (!U16_IS_SURROGATE(c)) { 800 return {c, 1, true, p, p0}; 801 } else { 802 UnitIter p1; 803 uint16_t c2; 804 if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) { 805 p = p1; 806 c = U16_GET_SUPPLEMENTARY(c2, c); 807 return {c, 2, true, p, p0}; 808 } else { 809 return {sub(c), 1, false, p, p0}; 810 } 811 } 812 } 813 }; 814 815 // UTF-32: trivial, but still validating 816 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter> 817 class UTFImpl< 818 CP32, behavior, 819 UnitIter, LimitIter, 820 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> { 821 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 822 public: 823 // Handle ill-formed UTF-32 824 U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) { 825 switch (behavior) { 826 case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL; 827 case UTF_BEHAVIOR_FFFD: return 0xfffd; 828 case UTF_BEHAVIOR_SURROGATE: return forSurrogate ? 
surrogate : 0xfffd; 829 } 830 } 831 832 U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) { 833 ++p; 834 } 835 836 U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) { 837 --p; 838 } 839 840 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc( 841 UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) { 842 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; 843 uint32_t uc = *p; 844 CP32 c = uc; 845 ++p; 846 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) { 847 if constexpr (isMultiPass) { 848 return {c, 1, true, p0, p}; 849 } else { 850 return {c, 1, true}; 851 } 852 } else { 853 if constexpr (isMultiPass) { 854 return {sub(uc < 0xe000, c), 1, false, p0, p}; 855 } else { 856 return {sub(uc < 0xe000, c), 1, false}; 857 } 858 } 859 } 860 861 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) { 862 UnitIter p0 = p; 863 uint32_t uc = *--p; 864 CP32 c = uc; 865 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) { 866 return {c, 1, true, p, p0}; 867 } else { 868 return {sub(uc < 0xe000, c), 1, false, p, p0}; 869 } 870 } 871 }; 872 873 // Non-validating implementations ------------------------------------------ *** 874 875 template<typename CP32, typename UnitIter, typename = void> 876 class UnsafeUTFImpl; 877 878 // UTF-8 879 template<typename CP32, typename UnitIter> 880 class UnsafeUTFImpl< 881 CP32, 882 UnitIter, 883 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> { 884 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 885 public: 886 U_FORCE_INLINE static void inc(UnitIter &p) { 887 // Very similar to U8_FWD_1_UNSAFE(). 888 uint8_t b = *p; 889 std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b)); 890 } 891 892 U_FORCE_INLINE static void dec(UnitIter &p) { 893 // Very similar to U8_BACK_1_UNSAFE(). 
894 while (U8_IS_TRAIL(*--p)) {} 895 } 896 897 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) { 898 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; 899 // Very similar to U8_NEXT_UNSAFE(). 900 CP32 c = uint8_t(*p); 901 ++p; 902 if (U8_IS_SINGLE(c)) { 903 if constexpr (isMultiPass) { 904 return {c, 1, p0, p}; 905 } else { 906 return {c, 1}; 907 } 908 } else if (c < 0xe0) { 909 c = ((c & 0x1f) << 6) | (*p & 0x3f); 910 ++p; 911 if constexpr (isMultiPass) { 912 return {c, 2, p0, p}; 913 } else { 914 return {c, 2}; 915 } 916 } else if (c < 0xf0) { 917 // No need for (c&0xf) because the upper bits are truncated 918 // after <<12 in the cast to uint16_t. 919 c = uint16_t(c << 12) | ((*p & 0x3f) << 6); 920 ++p; 921 c |= *p & 0x3f; 922 ++p; 923 if constexpr (isMultiPass) { 924 return {c, 3, p0, p}; 925 } else { 926 return {c, 3}; 927 } 928 } else { 929 c = ((c & 7) << 18) | ((*p & 0x3f) << 12); 930 ++p; 931 c |= (*p & 0x3f) << 6; 932 ++p; 933 c |= *p & 0x3f; 934 ++p; 935 if constexpr (isMultiPass) { 936 return {c, 4, p0, p}; 937 } else { 938 return {c, 4}; 939 } 940 } 941 } 942 943 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) { 944 // Very similar to U8_PREV_UNSAFE(). 
945 UnitIter p0 = p; 946 CP32 c = uint8_t(*--p); 947 if (U8_IS_SINGLE(c)) { 948 return {c, 1, p, p0}; 949 } 950 // U8_IS_TRAIL(c) if well-formed 951 c &= 0x3f; 952 uint8_t count = 1; 953 for (uint8_t shift = 6;;) { 954 uint8_t b = *--p; 955 if (b >= 0xc0) { 956 U8_MASK_LEAD_BYTE(b, count); 957 c |= uint32_t{b} << shift; 958 break; 959 } else { 960 c |= (uint32_t{b} & 0x3f) << shift; 961 ++count; 962 shift += 6; 963 } 964 } 965 ++count; 966 return {c, count, p, p0}; 967 } 968 }; 969 970 // UTF-16 971 template<typename CP32, typename UnitIter> 972 class UnsafeUTFImpl< 973 CP32, 974 UnitIter, 975 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> { 976 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 977 public: 978 U_FORCE_INLINE static void inc(UnitIter &p) { 979 // Very similar to U16_FWD_1_UNSAFE(). 980 auto c = *p; 981 ++p; 982 if (U16_IS_LEAD(c)) { 983 ++p; 984 } 985 } 986 987 U_FORCE_INLINE static void dec(UnitIter &p) { 988 // Very similar to U16_BACK_1_UNSAFE(). 989 if (U16_IS_TRAIL(*--p)) { 990 --p; 991 } 992 } 993 994 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) { 995 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; 996 // Very similar to U16_NEXT_UNSAFE(). 997 CP32 c = static_cast<CP32>(*p); 998 ++p; 999 if (!U16_IS_LEAD(c)) { 1000 if constexpr (isMultiPass) { 1001 return {c, 1, p0, p}; 1002 } else { 1003 return {c, 1}; 1004 } 1005 } else { 1006 uint16_t c2 = *p; 1007 ++p; 1008 c = U16_GET_SUPPLEMENTARY(c, c2); 1009 if constexpr (isMultiPass) { 1010 return {c, 2, p0, p}; 1011 } else { 1012 return {c, 2}; 1013 } 1014 } 1015 } 1016 1017 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) { 1018 // Very similar to U16_PREV_UNSAFE(). 
1019 UnitIter p0 = p; 1020 CP32 c = static_cast<CP32>(*--p); 1021 if (!U16_IS_TRAIL(c)) { 1022 return {c, 1, p, p0}; 1023 } else { 1024 uint16_t c2 = *--p; 1025 c = U16_GET_SUPPLEMENTARY(c2, c); 1026 return {c, 2, p, p0}; 1027 } 1028 } 1029 }; 1030 1031 // UTF-32: trivial 1032 template<typename CP32, typename UnitIter> 1033 class UnsafeUTFImpl< 1034 CP32, 1035 UnitIter, 1036 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> { 1037 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 1038 public: 1039 U_FORCE_INLINE static void inc(UnitIter &p) { 1040 ++p; 1041 } 1042 1043 U_FORCE_INLINE static void dec(UnitIter &p) { 1044 --p; 1045 } 1046 1047 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) { 1048 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; 1049 CP32 c = *p; 1050 ++p; 1051 if constexpr (isMultiPass) { 1052 return {c, 1, p0, p}; 1053 } else { 1054 return {c, 1}; 1055 } 1056 } 1057 1058 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) { 1059 UnitIter p0 = p; 1060 CP32 c = *--p; 1061 return {c, 1, p, p0}; 1062 } 1063 }; 1064 1065 #endif 1066 1067 // Validating iterators ---------------------------------------------------- *** 1068 1069 /** 1070 * Validating iterator over the code points in a Unicode string. 1071 * 1072 * The UnitIter can be 1073 * an input_iterator, a forward_iterator, or a bidirectional_iterator (including a pointer). 1074 * The UTFIterator will have the corresponding iterator_category. 1075 * 1076 * Call utfIterator() to have the compiler deduce the UnitIter and LimitIter types. 1077 * 1078 * For reverse iteration, either use this iterator directly as in <code>*--iter</code> 1079 * or wrap it using std::make_reverse_iterator(iter). 
 *
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
 *              should be signed if UTF_BEHAVIOR_NEGATIVE
 * @tparam behavior How to handle ill-formed Unicode strings
 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
 *     UTF-8: char or char8_t or uint8_t;
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
 * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
 * @draft ICU 78
 * @see utfIterator
 */
template<typename CP32, UTFIllFormedBehavior behavior,
         typename UnitIter, typename LimitIter = UnitIter, typename = void>
class UTFIterator {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;

    // Proxy type for operator->() (required by LegacyInputIterator)
    // so that we don't promise always returning CodeUnits.
    // The proxy stores its own copy of the CodeUnits,
    // so it does not refer back into the iterator that created it.
    class Proxy {
    public:
        explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
        CodeUnits<CP32, UnitIter> &operator*() { return units_; }
        CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
    private:
        CodeUnits<CP32, UnitIter> units_;
    };

public:
    /** C++ iterator boilerplate @internal */
    using value_type = CodeUnits<CP32, UnitIter>;
    /** C++ iterator boilerplate @internal */
    using reference = value_type;
    /** C++ iterator boilerplate @internal */
    using pointer = Proxy;
    /** C++ iterator boilerplate @internal */
    using difference_type = prv::iter_difference_t<UnitIter>;
    /** C++ iterator boilerplate @internal */
    using iterator_category = std::conditional_t<
        prv::bidirectional_iterator<UnitIter>,
        std::bidirectional_iterator_tag,
        std::forward_iterator_tag>;

    /**
     * Constructor with start <= p < limit.
     * All of these iterators/pointers should be at code point boundaries.
     * Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
     *
     * When using a code unit sentinel (UnitIter≠LimitIter),
     * then that sentinel also works as a sentinel for this code point iterator.
     *
     * @param start Start of the range
     * @param p Initial position inside the range
     * @param limit Limit (exclusive end) of the range
     * @draft ICU 78
     */
    // units_ is only a placeholder here: state_ starts at 0,
    // and in that state units_ is overwritten before it is read.
    U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) :
            p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
    /**
     * Constructor with start == p < limit.
     * All of these iterators/pointers should be at code point boundaries.
     *
     * When using a code unit sentinel (UnitIter≠LimitIter),
     * then that sentinel also works as a sentinel for this code point iterator.
     *
     * @param p Start of the range, and the initial position
     * @param limit Limit (exclusive end) of the range
     * @draft ICU 78
     */
    U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) :
            p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
    /**
     * Constructs an iterator start or limit sentinel.
     * The iterator/pointer should be at a code point boundary.
     * Requires UnitIter to be copyable.
     *
     * When using a code unit sentinel (UnitIter≠LimitIter),
     * then that sentinel also works as a sentinel for this code point iterator.
     *
     * @param p Range start or limit
     * @draft ICU 78
     */
    // start_, p_ and limit_ are all set to p, so this iterator spans an empty range.
    U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {}
    /**
     * Default constructor. Makes a non-functional iterator.
     *
     * @draft ICU 78
     */
    U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}

    /** Move constructor. @draft ICU 78 */
    U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
    /** Move assignment operator. @draft ICU 78 */
    U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;

    /** Copy constructor. @draft ICU 78 */
    U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
    /** Copy assignment operator. @draft ICU 78 */
    U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;

    /**
     * Compares the logical positions, so the result does not depend on whether
     * either iterator has already decoded (and moved p_ past) its current code point.
     *
     * @param other Another iterator
     * @return true if this iterator is at the same position as the other one
     * @draft ICU 78
     */
    U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
        return getLogicalPosition() == other.getLogicalPosition();
    }
    /**
     * @param other Another iterator
     * @return true if this iterator is not at the same position as the other one
     * @draft ICU 78
     */
    U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }

    // Asymmetric equality & nonequality with a sentinel type.

    /**
     * @param iter A UTFIterator
     * @param s A unit iterator sentinel
     * @return true if the iterator’s position is equal to the sentinel
     * @draft ICU 78
     */
    template<typename Sentinel> U_FORCE_INLINE friend
    std::enable_if_t<
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
        bool>
    operator==(const UTFIterator &iter, const Sentinel &s) {
        return iter.getLogicalPosition() == s;
    }

#if U_CPLUSPLUS_VERSION < 20
    // C++17: Need to define all four combinations of == / != vs. parameter order.
    // Once we require C++20, we could remove all but the first == because
    // the compiler would generate the rest.

    /**
     * @param s A unit iterator sentinel
     * @param iter A UTFIterator
     * @return true if the iterator’s position is equal to the sentinel
     * @internal
     */
    template<typename Sentinel> U_FORCE_INLINE friend
    std::enable_if_t<
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
        bool>
    operator==(const Sentinel &s, const UTFIterator &iter) {
        return iter.getLogicalPosition() == s;
    }
    /**
     * @param iter A UTFIterator
     * @param s A unit iterator sentinel
     * @return true if the iterator’s position is not equal to the sentinel
     * @internal
     */
    template<typename Sentinel> U_FORCE_INLINE friend
    std::enable_if_t<
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
        bool>
    operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
    /**
     * @param s A unit iterator sentinel
     * @param iter A UTFIterator
     * @return true if the iterator’s position is not equal to the sentinel
     * @internal
     */
    template<typename Sentinel> U_FORCE_INLINE friend
    std::enable_if_t<
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
        bool>
    operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
#endif  // C++17

    /**
     * Decodes the code unit sequence at the current position.
     * In the initial state (state_ == 0) this reads the sequence, caches the result,
     * and leaves p_ after the sequence; otherwise the cached result is returned.
     *
     * @return CodeUnits with the decoded code point etc.
     * @draft ICU 78
     */
    U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
        if (state_ == 0) {
            UnitIter p0 = p_;
            units_ = Impl::readAndInc(p0, p_, limit_);
            state_ = 1;
        }
        return units_;
    }

    /**
     * Decodes the code unit sequence at the current position.
     * Used like <code>iter->codePoint()</code> or <code>iter->stringView()</code> etc.
     *
     * @return CodeUnits with the decoded code point etc., wrapped into
     *     an opaque proxy object so that <code>iter->codePoint()</code> etc. works.
     * @draft ICU 78
     */
    U_FORCE_INLINE Proxy operator->() const {
        if (state_ == 0) {
            UnitIter p0 = p_;
            units_ = Impl::readAndInc(p0, p_, limit_);
            state_ = 1;
        }
        return Proxy(units_);
    }

    /**
     * Pre-increment operator.
     *
     * @return this iterator
     * @draft ICU 78
     */
    U_FORCE_INLINE UTFIterator &operator++() {  // pre-increment
        if (state_ > 0) {
            // operator*() called readAndInc() so p_ is already ahead.
            state_ = 0;
        } else if (state_ == 0) {
            // Nothing was decoded at this position: skip the sequence.
            Impl::inc(p_, limit_);
        } else /* state_ < 0 */ {
            // operator--() called decAndRead() so we know how far to skip.
            p_ = units_.end();
            state_ = 0;
        }
        return *this;
    }

    /**
     * Post-increment operator.
     *
     * @return a copy of this iterator from before the increment.
     *     If UnitIter is a single-pass input_iterator, then this function
     *     returns an opaque proxy object so that <code>*iter++</code> still works.
     * @draft ICU 78
     */
    U_FORCE_INLINE UTFIterator operator++(int) {  // post-increment
        if (state_ > 0) {
            // operator*() called readAndInc() so p_ is already ahead.
            UTFIterator result(*this);
            state_ = 0;
            return result;
        } else if (state_ == 0) {
            // Decode here so that the returned copy already holds the result
            // (its state_ is set to 1) while this iterator ends up after the sequence.
            UnitIter p0 = p_;
            units_ = Impl::readAndInc(p0, p_, limit_);
            UTFIterator result(*this);
            result.state_ = 1;
            // keep this->state_ == 0
            return result;
        } else /* state_ < 0 */ {
            UTFIterator result(*this);
            // operator--() called decAndRead() so we know how far to skip.
            p_ = units_.end();
            state_ = 0;
            return result;
        }
    }

    /**
     * Pre-decrement operator.
     * Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
     *
     * @return this iterator
     * @draft ICU 78
     */
    template<typename Iter = UnitIter>
    U_FORCE_INLINE
    std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &>
    operator--() {  // pre-decrement
        if (state_ > 0) {
            // operator*() called readAndInc() so p_ is ahead of the logical position.
            p_ = units_.begin();
        }
        // Now p_ is at the logical position: read the sequence before it.
        units_ = Impl::decAndRead(start_, p_);
        state_ = -1;
        return *this;
    }

    /**
     * Post-decrement operator.
     * Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
     *
     * @return a copy of this iterator from before the decrement.
     * @draft ICU 78
     */
    template<typename Iter = UnitIter>
    U_FORCE_INLINE
    std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator>
    operator--(int) {  // post-decrement
        UTFIterator result(*this);
        operator--();
        return result;
    }

private:
    friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>;

    // Returns the position of the code point that this iterator currently designates.
    // That is p_ itself, except after readAndInc() (state_ > 0) when p_ has moved past
    // the current code point and the position is the start of the cached units_.
    U_FORCE_INLINE UnitIter getLogicalPosition() const {
        return state_ <= 0 ? p_ : units_.begin();
    }

    // operator*() etc. are logically const.
    mutable UnitIter p_;
    // In a validating iterator, we need start_ & limit_ so that when we read a code point
    // (forward or backward) we can test if there are enough code units.
    UnitIter start_;
    LimitIter limit_;
    // Keep state so that we call readAndInc() only once for both operator*() and ++
    // to make it easy for the compiler to optimize.
    mutable CodeUnits<CP32, UnitIter> units_;
    // >0: units_ = readAndInc(), p_ = units limit
    //     which means that p_ is ahead of its logical position
    //  0: initial state
    // <0: units_ = decAndRead(), p_ = units start
    mutable int8_t state_ = 0;
};

#ifndef U_IN_DOXYGEN
// Partial template specialization for single-pass input iterator.
// Unlike the primary template, this specialization keeps no start position,
// offers no decrement operators, and its post-increment returns a Proxy rather than an iterator copy.
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
class UTFIterator<
        CP32, behavior,
        UnitIter, LimitIter,
        std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;

    // Proxy type for post-increment return value, to make *iter++ work.
    // Also for operator->() (required by LegacyInputIterator)
    // so that we don't promise always returning CodeUnits.
    // The proxy stores its own copy of the CodeUnits.
    class Proxy {
    public:
        explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
        CodeUnits<CP32, UnitIter> &operator*() { return units_; }
        CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
    private:
        CodeUnits<CP32, UnitIter> units_;
    };

public:
    using value_type = CodeUnits<CP32, UnitIter>;
    using reference = value_type;
    using pointer = Proxy;
    using difference_type = prv::iter_difference_t<UnitIter>;
    using iterator_category = std::input_iterator_tag;

    // Constructs an iterator at p for the range up to limit. Both arguments are moved in.
    U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {}

    // Constructs an iterator start or limit sentinel.
    // Requires p to be copyable.
    // (limit_ is initialized as a copy of the already-initialized member p_.)
    U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {}

    U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
    U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;

    U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
    U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;

    // Equal only if both the unit position and the read-ahead flag match.
    U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
        return p_ == other.p_ && ahead_ == other.ahead_;
        // Strictly speaking, we should check if the logical position is the same.
        // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
    }
    U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }

    // Comparison with a code unit sentinel.
    // An iterator that has read ahead (ahead_ == true) still designates the code point
    // before p_, so it never compares equal to the sentinel.
    template<typename Sentinel> U_FORCE_INLINE friend
    std::enable_if_t<
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
        bool>
    operator==(const UTFIterator &iter, const Sentinel &s) {
        return !iter.ahead_ && iter.p_ == s;
    }

#if U_CPLUSPLUS_VERSION < 20
    // Before C++20 the other three ==/!= vs. parameter order combinations are spelled out.
    template<typename Sentinel> U_FORCE_INLINE friend
    std::enable_if_t<
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
        bool>
    operator==(const Sentinel &s, const UTFIterator &iter) {
        return !iter.ahead_ && iter.p_ == s;
    }

    template<typename Sentinel> U_FORCE_INLINE friend
    std::enable_if_t<
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
        bool>
    operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }

    template<typename Sentinel> U_FORCE_INLINE friend
    std::enable_if_t<
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
        bool>
    operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
#endif  // C++17

    // Decodes the sequence at the current position on first use and caches it in units_;
    // afterwards p_ is past the sequence (ahead_ == true) and the cached value is returned.
    U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
        if (!ahead_) {
            units_ = Impl::readAndInc(p_, p_, limit_);
            ahead_ = true;
        }
        return units_;
    }

    // Same as operator*(), but returns the decoded CodeUnits wrapped in a Proxy.
    U_FORCE_INLINE Proxy operator->() const {
        if (!ahead_) {
            units_ = Impl::readAndInc(p_, p_, limit_);
            ahead_ = true;
        }
        return Proxy(units_);
    }

    U_FORCE_INLINE UTFIterator &operator++() {  // pre-increment
        if (ahead_) {
            // operator*() called readAndInc() so p_ is already ahead.
            ahead_ = false;
        } else {
            // Nothing was decoded at this position: skip the sequence.
            Impl::inc(p_, limit_);
        }
        return *this;
    }

    // Returns a Proxy holding the CodeUnits of the code point that was current
    // before the increment, decoding it first if that has not happened yet.
    U_FORCE_INLINE Proxy operator++(int) {  // post-increment
        if (ahead_) {
            // operator*() called readAndInc() so p_ is already ahead.
            ahead_ = false;
        } else {
            units_ = Impl::readAndInc(p_, p_, limit_);
            // keep this->ahead_ == false
        }
        return Proxy(units_);
    }

private:
    // operator*() etc. are logically const.
    mutable UnitIter p_;
    // In a validating iterator, we need limit_ so that when we read a code point
    // we can test if there are enough code units.
    LimitIter limit_;
    // Keep state so that we call readAndInc() only once for both operator*() and ++
    // so that we can use a single-pass input iterator for UnitIter.
    mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false};
    // true: units_ = readAndInc(), p_ = units limit
    //     which means that p_ is ahead of its logical position
    // false: initial state
    mutable bool ahead_ = false;
};
#endif  // U_IN_DOXYGEN

}  // namespace U_HEADER_ONLY_NAMESPACE

#ifndef U_IN_DOXYGEN
// Bespoke specialization of reverse_iterator.
// The default implementation implements reverse operator*() and ++ in a way
// that does most of the same work twice for reading variable-length sequences.
// This specialization mirrors the forward UTFIterator state machine:
// dereferencing reads backward with decAndRead() and caches the result.
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
    using CodeUnits_ = U_HEADER_ONLY_NAMESPACE::CodeUnits<CP32, UnitIter>;

    // Proxy type for operator->() (required by LegacyInputIterator)
    // so that we don't promise always returning CodeUnits.
    class Proxy {
    public:
        explicit Proxy(CodeUnits_ units) : units_(units) {}
        CodeUnits_ &operator*() { return units_; }
        CodeUnits_ *operator->() { return &units_; }
    private:
        CodeUnits_ units_;
    };

public:
    using value_type = CodeUnits_;
    using reference = value_type;
    using pointer = Proxy;
    using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
    using iterator_category = std::bidirectional_iterator_tag;

    // Starts at the logical position of the forward iterator and takes over its start and limit.
    // units_ is only a placeholder: state_ starts at 0, and in that state it is overwritten before it is read.
    U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter> iter) :
            p_(iter.getLogicalPosition()), start_(iter.start_), limit_(iter.limit_),
            units_(0, 0, false, p_, p_) {}
    U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}

    U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
    U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;

    U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
    U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;

    // Compares logical positions, independent of whether either side has already decoded.
    U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
        return getLogicalPosition() == other.getLogicalPosition();
    }
    U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }

    // Decodes the sequence that ends at the current position on first use and caches it;
    // afterwards p_ is at the start of that sequence (state_ < 0).
    U_FORCE_INLINE CodeUnits_ operator*() const {
        if (state_ == 0) {
            units_ = Impl::decAndRead(start_, p_);
            state_ = -1;
        }
        return units_;
    }

    // Same as operator*(), but returns the decoded CodeUnits wrapped in a Proxy.
    U_FORCE_INLINE Proxy operator->() const {
        if (state_ == 0) {
            units_ = Impl::decAndRead(start_, p_);
            state_ = -1;
        }
        return Proxy(units_);
    }

    U_FORCE_INLINE reverse_iterator &operator++() {  // pre-increment
        if (state_ < 0) {
            // operator*() called decAndRead() so p_ is already behind.
            state_ = 0;
        } else if (state_ == 0) {
            // Nothing was decoded at this position: skip backward over the sequence.
            Impl::dec(start_, p_);
        } else /* state_ > 0 */ {
            // operator--() called readAndInc() so we know how far to skip.
            p_ = units_.begin();
            state_ = 0;
        }
        return *this;
    }

    U_FORCE_INLINE reverse_iterator operator++(int) {  // post-increment
        if (state_ < 0) {
            // operator*() called decAndRead() so p_ is already behind.
            reverse_iterator result(*this);
            state_ = 0;
            return result;
        } else if (state_ == 0) {
            // Decode here so that the returned copy already holds the result
            // (its state_ is set to -1) while this iterator ends up before the sequence.
            units_ = Impl::decAndRead(start_, p_);
            reverse_iterator result(*this);
            result.state_ = -1;
            // keep this->state_ == 0
            return result;
        } else /* state_ > 0 */ {
            reverse_iterator result(*this);
            // operator--() called readAndInc() so we know how far to skip.
            p_ = units_.begin();
            state_ = 0;
            return result;
        }
    }

    U_FORCE_INLINE reverse_iterator &operator--() {  // pre-decrement
        if (state_ < 0) {
            // operator*() called decAndRead() so p_ is behind the logical position.
            p_ = units_.end();
        }
        // Now p_ is at the logical position: read the sequence after it.
        UnitIter p0 = p_;
        units_ = Impl::readAndInc(p0, p_, limit_);
        state_ = 1;
        return *this;
    }

    U_FORCE_INLINE reverse_iterator operator--(int) {  // post-decrement
        reverse_iterator result(*this);
        operator--();
        return result;
    }

private:
    // Returns the position that this reverse iterator currently designates.
    // That is p_ itself, except after decAndRead() (state_ < 0) when p_ has moved
    // to the start of the cached units_ and the position is their end.
    U_FORCE_INLINE UnitIter getLogicalPosition() const {
        return state_ >= 0 ? p_ : units_.end();
    }

    // operator*() etc. are logically const.
    mutable UnitIter p_;
    // In a validating iterator, we need start_ & limit_ so that when we read a code point
    // (forward or backward) we can test if there are enough code units.
    UnitIter start_;
    UnitIter limit_;
    // Keep state so that we call decAndRead() only once for both operator*() and ++
    // to make it easy for the compiler to optimize.
    mutable CodeUnits_ units_;
    // >0: units_ = readAndInc(), p_ = units limit
    //  0: initial state
    // <0: units_ = decAndRead(), p_ = units start
    //     which means that p_ is behind its logical position
    mutable int8_t state_ = 0;
};
#endif  // U_IN_DOXYGEN

namespace U_HEADER_ONLY_NAMESPACE {

/**
 * UTFIterator factory function for start <= p < limit.
 * Deduces the UnitIter and LimitIter template parameters from the inputs.
 * Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
 *
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
 * @tparam behavior How to handle ill-formed Unicode strings
 * @tparam UnitIter Can usually be omitted/deduced:
 *     An iterator (often a pointer) that returns a code unit type:
 *     UTF-8: char or char8_t or uint8_t;
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
 * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
 * @param start start code unit iterator
 * @param p current-position code unit iterator
 * @param limit limit (exclusive-end) code unit iterator.
 *     When using a code unit sentinel (UnitIter≠LimitIter),
 *     then that sentinel also works as a sentinel for the code point iterator.
 * @return a UTFIterator<CP32, behavior, UnitIter, LimitIter>
 *     for the given code unit iterators or character pointers
 * @draft ICU 78
 */
template<typename CP32, UTFIllFormedBehavior behavior,
         typename UnitIter, typename LimitIter = UnitIter>
auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
    return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
        std::move(start), std::move(p), std::move(limit));
}

/**
 * UTFIterator factory function for start = p < limit.
 * Deduces the UnitIter and LimitIter template parameters from the inputs.
 *
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
 * @tparam behavior How to handle ill-formed Unicode strings
 * @tparam UnitIter Can usually be omitted/deduced:
 *     An iterator (often a pointer) that returns a code unit type:
 *     UTF-8: char or char8_t or uint8_t;
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
 * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
 * @param p start and current-position code unit iterator
 * @param limit limit (exclusive-end) code unit iterator.
 *     When using a code unit sentinel (UnitIter≠LimitIter),
 *     then that sentinel also works as a sentinel for the code point iterator.
 * @return a UTFIterator<CP32, behavior, UnitIter, LimitIter>
 *     for the given code unit iterators or character pointers
 * @draft ICU 78
 */
template<typename CP32, UTFIllFormedBehavior behavior,
         typename UnitIter, typename LimitIter = UnitIter>
auto utfIterator(UnitIter p, LimitIter limit) {
    // Both arguments are taken by value and moved into the iterator.
    return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
        std::move(p), std::move(limit));
}

// Note: We should only enable the following factory function for a copyable UnitIter.
// In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator,
// but a function template partial specialization is not allowed.
// In C++20, we might be able to require the std::copyable concept.

/**
 * UTFIterator factory function for a start or limit sentinel.
 * Deduces the UnitIter template parameter from the input.
 * Requires UnitIter to be copyable.
 *
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
 * @tparam behavior How to handle ill-formed Unicode strings
 * @tparam UnitIter Can usually be omitted/deduced:
 *     An iterator (often a pointer) that returns a code unit type:
 *     UTF-8: char or char8_t or uint8_t;
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
 * @param p code unit iterator.
 *     When using a code unit sentinel,
 *     then that sentinel also works as a sentinel for the code point iterator.
 * @return a UTFIterator<CP32, behavior, UnitIter>
 *     for the given code unit iterator or character pointer
 * @draft ICU 78
 */
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
auto utfIterator(UnitIter p) {
    // Uses the one-argument UTFIterator constructor; LimitIter defaults to UnitIter.
    return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
}

/**
 * A C++ "range" for validating iteration over all of the code points of a code unit range.
 *
 * Call utfStringCodePoints() to have the compiler deduce the Range type.
 *
 * UTFStringCodePoints is conditionally borrowed; that is, if Range is a borrowed range
 * so is UTFStringCodePoints<CP32, behavior, Range>.
 * Note that when given a range r that is an lvalue and is not a view, utfStringCodePoints(r) uses a
 * ref_view of r as the Range type, which is a borrowed range.
 * In practice, this means that given a container variable r, the iterators of utfStringCodePoints(r) can
 * be used as long as iterators on r are valid, without having to keep utfStringCodePoints(r) around.
 * For instance:
 * \code
 *     std::u8string s = u8"𒇧𒇧";
 *     // it outlives utfStringCodePoints<char32_t>(s).
 *     auto it = utfStringCodePoints<char32_t>(s).begin();
 *     ++it;
 *     char32_t second_code_point = it->codePoint();  // OK.
 * \endcode
 *
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
 *              should be signed if UTF_BEHAVIOR_NEGATIVE
 * @tparam behavior How to handle ill-formed Unicode strings
 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
 * @draft ICU 78
 * @see utfStringCodePoints
 */
template<typename CP32, UTFIllFormedBehavior behavior, typename Range>
class UTFStringCodePoints {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
    /**
     * Constructs an empty C++ "range" object.
     * @draft ICU 78
     */
    UTFStringCodePoints() = default;

    /**
     * Constructs a C++ "range" object over the code points in the string.
     * Only enabled if Range is not a reference type; the argument is moved into this object.
     * @param unitRange input range
     * @draft ICU 78
     */
    template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
    explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
    /**
     * Constructs a C++ "range" object over the code points in the string,
     * keeping a reference to the code unit range. This overload is used by
     * utfStringCodePoints in C++17; in C++20, a ref_view is used instead (via
     * views::all).
     * Only enabled if Range is a reference type.
     * @param unitRange input range
     * @draft ICU 78
     */
    // The extra "typename = void" distinguishes this template from the constructor above.
    template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
    explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}

    /** Copy constructor. @draft ICU 78 */
    UTFStringCodePoints(const UTFStringCodePoints &other) = default;

    /** Copy assignment operator. @draft ICU 78 */
    UTFStringCodePoints &operator=(const UTFStringCodePoints &other) = default;

    /**
     * @return the range start iterator
     * @draft ICU 78
     */
    auto begin() {
        return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
    }

    /**
     * Only enabled if const Range is itself a range.
     *
     * @return the range start iterator
     * @draft ICU 78
     */
    template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
    auto begin() const {
        return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
    }

    /**
     * @return the range limit (exclusive end) iterator;
     *     if the code unit range uses a sentinel type, then that sentinel is returned as is
     * @draft ICU 78
     */
    auto end() {
        using UnitIter = decltype(unitRange.begin());
        using LimitIter = decltype(unitRange.end());
        if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
            // Return the code unit sentinel.
            return unitRange.end();
        } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
            // Pass the range start as well so that the limit iterator can be decremented.
            return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
        } else {
            // The input iterator specialization has no three-argument constructor.
            return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
        }
    }

    /**
     * Only enabled if const Range is itself a range.
     *
     * @return the range limit (exclusive end) iterator;
     *     if the code unit range uses a sentinel type, then that sentinel is returned as is
     * @draft ICU 78
     */
    template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
    auto end() const {
        using UnitIter = decltype(unitRange.begin());
        using LimitIter = decltype(unitRange.end());
        if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
            // Return the code unit sentinel.
            return unitRange.end();
        } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
            // Pass the range start as well so that the limit iterator can be decremented.
            return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
        } else {
            // The input iterator specialization has no three-argument constructor.
            return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
        }
    }

    /**
     * @return std::reverse_iterator(end())
     * @draft ICU 78
     */
    auto rbegin() const {
        return std::make_reverse_iterator(end());
    }

    /**
     * @return std::reverse_iterator(begin())
     * @draft ICU 78
     */
    auto rend() const {
        return std::make_reverse_iterator(begin());
    }

private:
    // The code unit range (or, if Range is a reference type, a reference to it).
    Range unitRange;
};

/** @internal */
template<typename CP32, UTFIllFormedBehavior behavior>
struct UTFStringCodePointsAdaptor
#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \
    __cpp_lib_bind_back >= 2022'02  // http://wg21.link/P2387R3.
    : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
#endif
{
    /**
     * Wraps the code unit range in a UTFStringCodePoints object.
     * With C++20 ranges, the Range type is views::all_t of the argument type.
     * Otherwise a basic_string_view is stored by value, and for any other argument
     * the deduced Range type is used as is (a reference type for an lvalue).
     * @internal
     */
    template<typename Range>
    auto operator()(Range &&unitRange) const {
#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10  // We need https://wg21.link/P2415R2.
        return UTFStringCodePoints<CP32, behavior, std::ranges::views::all_t<Range>>(
            std::forward<Range>(unitRange));
#else
        if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
            // Take basic_string_view by copy, not by reference.  In C++20 this is handled by
            // all_t<Range>, which is Range if Range is a view.
            return UTFStringCodePoints<CP32, behavior, std::decay_t<Range>>(
                std::forward<Range>(unitRange));
        } else {
            return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
        }
#endif
    }
};

/**
 * Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code
 * points in a code unit range, which validates while decoding.
1914 * Deduces the Range template parameter from the input, taking into account the value category: the 1915 * code units will be referenced if possible, and moved if necessary. 1916 * 1917 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; 1918 * should be signed if UTF_BEHAVIOR_NEGATIVE 1919 * @tparam behavior How to handle ill-formed Unicode strings 1920 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units 1921 * @param unitRange input range 1922 * @return a UTFStringCodePoints<CP32, behavior, Range> for the given unitRange 1923 * @draft ICU 78 1924 */ 1925 template<typename CP32, UTFIllFormedBehavior behavior> 1926 constexpr UTFStringCodePointsAdaptor<CP32, behavior> utfStringCodePoints; 1927 1928 // Non-validating iterators ------------------------------------------------ *** 1929 1930 /** 1931 * Non-validating iterator over the code points in a Unicode string. 1932 * The string must be well-formed. 1933 * 1934 * The UnitIter can be 1935 * an input_iterator, a forward_iterator, or a bidirectional_iterator (including a pointer). 1936 * The UTFIterator will have the corresponding iterator_category. 1937 * 1938 * Call unsafeUTFIterator() to have the compiler deduce the UnitIter type. 1939 * 1940 * For reverse iteration, either use this iterator directly as in <code>*--iter</code> 1941 * or wrap it using std::make_reverse_iterator(iter). 
1942 * 1943 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t 1944 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: 1945 * UTF-8: char or char8_t or uint8_t; 1946 * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; 1947 * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t 1948 * @draft ICU 78 1949 * @see unsafeUTFIterator 1950 */ 1951 template<typename CP32, typename UnitIter, typename = void> 1952 class UnsafeUTFIterator { 1953 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 1954 using Impl = UnsafeUTFImpl<CP32, UnitIter>; 1955 1956 // Proxy type for operator->() (required by LegacyInputIterator) 1957 // so that we don't promise always returning UnsafeCodeUnits. 1958 class Proxy { 1959 public: 1960 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {} 1961 UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; } 1962 UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; } 1963 private: 1964 UnsafeCodeUnits<CP32, UnitIter> units_; 1965 }; 1966 1967 public: 1968 /** C++ iterator boilerplate @internal */ 1969 using value_type = UnsafeCodeUnits<CP32, UnitIter>; 1970 /** C++ iterator boilerplate @internal */ 1971 using reference = value_type; 1972 /** C++ iterator boilerplate @internal */ 1973 using pointer = Proxy; 1974 /** C++ iterator boilerplate @internal */ 1975 using difference_type = prv::iter_difference_t<UnitIter>; 1976 /** C++ iterator boilerplate @internal */ 1977 using iterator_category = std::conditional_t< 1978 prv::bidirectional_iterator<UnitIter>, 1979 std::bidirectional_iterator_tag, 1980 std::forward_iterator_tag>; 1981 1982 /** 1983 * Constructor; the iterator/pointer should be at a code point boundary. 1984 * 1985 * When using a code unit sentinel, 1986 * then that sentinel also works as a sentinel for this code point iterator. 
1987 * 1988 * @param p Initial position inside the range, or a range sentinel 1989 * @draft ICU 78 1990 */ 1991 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {} 1992 /** 1993 * Default constructor. Makes a non-functional iterator. 1994 * 1995 * @draft ICU 78 1996 */ 1997 U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {} 1998 1999 /** Move constructor. @draft ICU 78 */ 2000 U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default; 2001 /** Move assignment operator. @draft ICU 78 */ 2002 U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default; 2003 2004 /** Copy constructor. @draft ICU 78 */ 2005 U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default; 2006 /** Copy assignment operator. @draft ICU 78 */ 2007 U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default; 2008 2009 /** 2010 * @param other Another iterator 2011 * @return true if this iterator is at the same position as the other one 2012 * @draft ICU 78 2013 */ 2014 U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const { 2015 return getLogicalPosition() == other.getLogicalPosition(); 2016 } 2017 /** 2018 * @param other Another iterator 2019 * @return true if this iterator is not at the same position as the other one 2020 * @draft ICU 78 2021 */ 2022 U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); } 2023 2024 /** 2025 * @param iter An UnsafeUTFIterator 2026 * @param s A unit iterator sentinel 2027 * @return true if the iterator’s position is equal to the sentinel 2028 * @draft ICU 78 2029 */ 2030 template<typename Sentinel> U_FORCE_INLINE friend 2031 std::enable_if_t< 2032 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, 2033 bool> 2034 operator==(const UnsafeUTFIterator &iter, const Sentinel &s) { 2035 return iter.getLogicalPosition() == s; 2036 } 2037 2038 
#if U_CPLUSPLUS_VERSION < 20 2039 /** 2040 * @param s A unit iterator sentinel 2041 * @param iter An UnsafeUTFIterator 2042 * @return true if the iterator’s position is equal to the sentinel 2043 * @internal 2044 */ 2045 template<typename Sentinel> U_FORCE_INLINE friend 2046 std::enable_if_t< 2047 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, 2048 bool> 2049 operator==(const Sentinel &s, const UnsafeUTFIterator &iter) { 2050 return iter.getLogicalPosition() == s; 2051 } 2052 /** 2053 * @param iter An UnsafeUTFIterator 2054 * @param s A unit iterator sentinel 2055 * @return true if the iterator’s position is not equal to the sentinel 2056 * @internal 2057 */ 2058 template<typename Sentinel> U_FORCE_INLINE friend 2059 std::enable_if_t< 2060 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, 2061 bool> 2062 operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); } 2063 /** 2064 * @param s A unit iterator sentinel 2065 * @param iter An UnsafeUTFIterator 2066 * @return true if the iterator’s position is not equal to the sentinel 2067 * @internal 2068 */ 2069 template<typename Sentinel> U_FORCE_INLINE friend 2070 std::enable_if_t< 2071 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, 2072 bool> 2073 operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); } 2074 #endif // C++17 2075 2076 /** 2077 * Decodes the code unit sequence at the current position. 2078 * 2079 * @return CodeUnits with the decoded code point etc. 2080 * @draft ICU 78 2081 */ 2082 U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const { 2083 if (state_ == 0) { 2084 UnitIter p0 = p_; 2085 units_ = Impl::readAndInc(p0, p_); 2086 state_ = 1; 2087 } 2088 return units_; 2089 } 2090 2091 /** 2092 * Decodes the code unit sequence at the current position. 
2093 * Used like <code>iter->codePoint()</code> or <code>iter->stringView()</code> etc. 2094 * 2095 * @return CodeUnits with the decoded code point etc., wrapped into 2096 * an opaque proxy object so that <code>iter->codePoint()</code> etc. works. 2097 * @draft ICU 78 2098 */ 2099 U_FORCE_INLINE Proxy operator->() const { 2100 if (state_ == 0) { 2101 UnitIter p0 = p_; 2102 units_ = Impl::readAndInc(p0, p_); 2103 state_ = 1; 2104 } 2105 return Proxy(units_); 2106 } 2107 2108 /** 2109 * Pre-increment operator. 2110 * 2111 * @return this iterator 2112 * @draft ICU 78 2113 */ 2114 U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment 2115 if (state_ > 0) { 2116 // operator*() called readAndInc() so p_ is already ahead. 2117 state_ = 0; 2118 } else if (state_ == 0) { 2119 Impl::inc(p_); 2120 } else /* state_ < 0 */ { 2121 // operator--() called decAndRead() so we know how far to skip. 2122 p_ = units_.end(); 2123 state_ = 0; 2124 } 2125 return *this; 2126 } 2127 2128 /** 2129 * Post-increment operator. 2130 * 2131 * @return a copy of this iterator from before the increment. 2132 * If UnitIter is a single-pass input_iterator, then this function 2133 * returns an opaque proxy object so that <code>*iter++</code> still works. 2134 * @draft ICU 78 2135 */ 2136 U_FORCE_INLINE UnsafeUTFIterator operator++(int) { // post-increment 2137 if (state_ > 0) { 2138 // operator*() called readAndInc() so p_ is already ahead. 2139 UnsafeUTFIterator result(*this); 2140 state_ = 0; 2141 return result; 2142 } else if (state_ == 0) { 2143 UnitIter p0 = p_; 2144 units_ = Impl::readAndInc(p0, p_); 2145 UnsafeUTFIterator result(*this); 2146 result.state_ = 1; 2147 // keep this->state_ == 0 2148 return result; 2149 } else /* state_ < 0 */ { 2150 UnsafeUTFIterator result(*this); 2151 // operator--() called decAndRead() so we know how far to skip. 2152 p_ = units_.end(); 2153 state_ = 0; 2154 return result; 2155 } 2156 } 2157 2158 /** 2159 * Pre-decrement operator. 
2160 * Only enabled if UnitIter is a bidirectional_iterator (including a pointer). 2161 * 2162 * @return this iterator 2163 * @draft ICU 78 2164 */ 2165 template<typename Iter = UnitIter> 2166 U_FORCE_INLINE 2167 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &> 2168 operator--() { // pre-decrement 2169 if (state_ > 0) { 2170 // operator*() called readAndInc() so p_ is ahead of the logical position. 2171 p_ = units_.begin(); 2172 } 2173 units_ = Impl::decAndRead(p_); 2174 state_ = -1; 2175 return *this; 2176 } 2177 2178 /** 2179 * Post-decrement operator. 2180 * Only enabled if UnitIter is a bidirectional_iterator (including a pointer). 2181 * 2182 * @return a copy of this iterator from before the decrement. 2183 * @draft ICU 78 2184 */ 2185 template<typename Iter = UnitIter> 2186 U_FORCE_INLINE 2187 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator> 2188 operator--(int) { // post-decrement 2189 UnsafeUTFIterator result(*this); 2190 operator--(); 2191 return result; 2192 } 2193 2194 private: 2195 friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>; 2196 2197 U_FORCE_INLINE UnitIter getLogicalPosition() const { 2198 return state_ <= 0 ? p_ : units_.begin(); 2199 } 2200 2201 // operator*() etc. are logically const. 2202 mutable UnitIter p_; 2203 // Keep state so that we call readAndInc() only once for both operator*() and ++ 2204 // to make it easy for the compiler to optimize. 2205 mutable UnsafeCodeUnits<CP32, UnitIter> units_; 2206 // >0: units_ = readAndInc(), p_ = units limit 2207 // which means that p_ is ahead of its logical position 2208 // 0: initial state 2209 // <0: units_ = decAndRead(), p_ = units start 2210 mutable int8_t state_ = 0; 2211 }; 2212 2213 #ifndef U_IN_DOXYGEN 2214 // Partial template specialization for single-pass input iterator. 
2215 template<typename CP32, typename UnitIter> 2216 class UnsafeUTFIterator< 2217 CP32, 2218 UnitIter, 2219 std::enable_if_t<!prv::forward_iterator<UnitIter>>> { 2220 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 2221 using Impl = UnsafeUTFImpl<CP32, UnitIter>; 2222 2223 // Proxy type for post-increment return value, to make *iter++ work. 2224 // Also for operator->() (required by LegacyInputIterator) 2225 // so that we don't promise always returning UnsafeCodeUnits. 2226 class Proxy { 2227 public: 2228 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {} 2229 UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; } 2230 UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; } 2231 private: 2232 UnsafeCodeUnits<CP32, UnitIter> units_; 2233 }; 2234 2235 public: 2236 using value_type = UnsafeCodeUnits<CP32, UnitIter>; 2237 using reference = value_type; 2238 using pointer = Proxy; 2239 using difference_type = prv::iter_difference_t<UnitIter>; 2240 using iterator_category = std::input_iterator_tag; 2241 2242 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {} 2243 2244 U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default; 2245 U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default; 2246 2247 U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default; 2248 U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default; 2249 2250 U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const { 2251 return p_ == other.p_ && ahead_ == other.ahead_; 2252 // Strictly speaking, we should check if the logical position is the same. 2253 // However, we cannot advance, or do arithmetic with, a single-pass UnitIter. 
2254 } 2255 U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); } 2256 2257 template<typename Sentinel> U_FORCE_INLINE friend 2258 std::enable_if_t< 2259 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, 2260 bool> 2261 operator==(const UnsafeUTFIterator &iter, const Sentinel &s) { 2262 return !iter.ahead_ && iter.p_ == s; 2263 } 2264 2265 #if U_CPLUSPLUS_VERSION < 20 2266 template<typename Sentinel> U_FORCE_INLINE friend 2267 std::enable_if_t< 2268 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, 2269 bool> 2270 operator==(const Sentinel &s, const UnsafeUTFIterator &iter) { 2271 return !iter.ahead_ && iter.p_ == s; 2272 } 2273 2274 template<typename Sentinel> U_FORCE_INLINE friend 2275 std::enable_if_t< 2276 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, 2277 bool> 2278 operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); } 2279 2280 template<typename Sentinel> U_FORCE_INLINE friend 2281 std::enable_if_t< 2282 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, 2283 bool> 2284 operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); } 2285 #endif // C++17 2286 2287 U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const { 2288 if (!ahead_) { 2289 units_ = Impl::readAndInc(p_, p_); 2290 ahead_ = true; 2291 } 2292 return units_; 2293 } 2294 2295 U_FORCE_INLINE Proxy operator->() const { 2296 if (!ahead_) { 2297 units_ = Impl::readAndInc(p_, p_); 2298 ahead_ = true; 2299 } 2300 return Proxy(units_); 2301 } 2302 2303 U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment 2304 if (ahead_) { 2305 // operator*() called readAndInc() so p_ is already ahead. 
2306 ahead_ = false; 2307 } else { 2308 Impl::inc(p_); 2309 } 2310 return *this; 2311 } 2312 2313 U_FORCE_INLINE Proxy operator++(int) { // post-increment 2314 if (ahead_) { 2315 // operator*() called readAndInc() so p_ is already ahead. 2316 ahead_ = false; 2317 } else { 2318 units_ = Impl::readAndInc(p_, p_); 2319 // keep this->ahead_ == false 2320 } 2321 return Proxy(units_); 2322 } 2323 2324 private: 2325 // operator*() etc. are logically const. 2326 mutable UnitIter p_; 2327 // Keep state so that we call readAndInc() only once for both operator*() and ++ 2328 // so that we can use a single-pass input iterator for UnitIter. 2329 mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0}; 2330 // true: units_ = readAndInc(), p_ = units limit 2331 // which means that p_ is ahead of its logical position 2332 // false: initial state 2333 mutable bool ahead_ = false; 2334 }; 2335 #endif // U_IN_DOXYGEN 2336 2337 } // namespace U_HEADER_ONLY_NAMESPACE 2338 2339 #ifndef U_IN_DOXYGEN 2340 // Bespoke specialization of reverse_iterator. 2341 // The default implementation implements reverse operator*() and ++ in a way 2342 // that does most of the same work twice for reading variable-length sequences. 2343 template<typename CP32, typename UnitIter> 2344 class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> { 2345 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 2346 using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>; 2347 using UnsafeCodeUnits_ = U_HEADER_ONLY_NAMESPACE::UnsafeCodeUnits<CP32, UnitIter>; 2348 2349 // Proxy type for operator->() (required by LegacyInputIterator) 2350 // so that we don't promise always returning UnsafeCodeUnits. 
2351 class Proxy { 2352 public: 2353 explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {} 2354 UnsafeCodeUnits_ &operator*() { return units_; } 2355 UnsafeCodeUnits_ *operator->() { return &units_; } 2356 private: 2357 UnsafeCodeUnits_ units_; 2358 }; 2359 2360 public: 2361 using value_type = UnsafeCodeUnits_; 2362 using reference = value_type; 2363 using pointer = Proxy; 2364 using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>; 2365 using iterator_category = std::bidirectional_iterator_tag; 2366 2367 U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> iter) : 2368 p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {} 2369 U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {} 2370 2371 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default; 2372 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default; 2373 2374 U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default; 2375 U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default; 2376 2377 U_FORCE_INLINE bool operator==(const reverse_iterator &other) const { 2378 return getLogicalPosition() == other.getLogicalPosition(); 2379 } 2380 U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); } 2381 2382 U_FORCE_INLINE UnsafeCodeUnits_ operator*() const { 2383 if (state_ == 0) { 2384 units_ = Impl::decAndRead(p_); 2385 state_ = -1; 2386 } 2387 return units_; 2388 } 2389 2390 U_FORCE_INLINE Proxy operator->() const { 2391 if (state_ == 0) { 2392 units_ = Impl::decAndRead(p_); 2393 state_ = -1; 2394 } 2395 return Proxy(units_); 2396 } 2397 2398 U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment 2399 if (state_ < 0) { 2400 // operator*() called decAndRead() so p_ is already behind. 
2401 state_ = 0; 2402 } else if (state_ == 0) { 2403 Impl::dec(p_); 2404 } else /* state_ > 0 */ { 2405 // operator--() called readAndInc() so we know how far to skip. 2406 p_ = units_.begin(); 2407 state_ = 0; 2408 } 2409 return *this; 2410 } 2411 2412 U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment 2413 if (state_ < 0) { 2414 // operator*() called decAndRead() so p_ is already behind. 2415 reverse_iterator result(*this); 2416 state_ = 0; 2417 return result; 2418 } else if (state_ == 0) { 2419 units_ = Impl::decAndRead(p_); 2420 reverse_iterator result(*this); 2421 result.state_ = -1; 2422 // keep this->state_ == 0 2423 return result; 2424 } else /* state_ > 0 */ { 2425 reverse_iterator result(*this); 2426 // operator--() called readAndInc() so we know how far to skip. 2427 p_ = units_.begin(); 2428 state_ = 0; 2429 return result; 2430 } 2431 } 2432 2433 U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement 2434 if (state_ < 0) { 2435 // operator*() called decAndRead() so p_ is behind the logical position. 2436 p_ = units_.end(); 2437 } 2438 UnitIter p0 = p_; 2439 units_ = Impl::readAndInc(p0, p_); 2440 state_ = 1; 2441 return *this; 2442 } 2443 2444 U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement 2445 reverse_iterator result(*this); 2446 operator--(); 2447 return result; 2448 } 2449 2450 private: 2451 U_FORCE_INLINE UnitIter getLogicalPosition() const { 2452 return state_ >= 0 ? p_ : units_.end(); 2453 } 2454 2455 // operator*() etc. are logically const. 2456 mutable UnitIter p_; 2457 // Keep state so that we call decAndRead() only once for both operator*() and ++ 2458 // to make it easy for the compiler to optimize. 
2459 mutable UnsafeCodeUnits_ units_; 2460 // >0: units_ = readAndInc(), p_ = units limit 2461 // 0: initial state 2462 // <0: units_ = decAndRead(), p_ = units start 2463 // which means that p_ is behind its logical position 2464 mutable int8_t state_ = 0; 2465 }; 2466 #endif // U_IN_DOXYGEN 2467 2468 namespace U_HEADER_ONLY_NAMESPACE { 2469 2470 /** 2471 * UnsafeUTFIterator factory function. 2472 * Deduces the UnitIter template parameter from the input. 2473 * 2474 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t 2475 * @tparam UnitIter Can usually be omitted/deduced: 2476 * An iterator (often a pointer) that returns a code unit type: 2477 * UTF-8: char or char8_t or uint8_t; 2478 * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; 2479 * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t 2480 * @param iter code unit iterator 2481 * @return an UnsafeUTFIterator<CP32, UnitIter> 2482 * for the given code unit iterator or character pointer 2483 * @draft ICU 78 2484 */ 2485 template<typename CP32, typename UnitIter> 2486 auto unsafeUTFIterator(UnitIter iter) { 2487 return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter)); 2488 } 2489 2490 /** 2491 * A C++ "range" for non-validating iteration over all of the code points of a code unit range. 2492 * The string must be well-formed. 2493 * 2494 * Call unsafeUTFStringCodePoints() to have the compiler deduce the Range type. 2495 * 2496 * UnsafeUTFStringCodePoints is conditionally borrowed; that is, if Range is a borrowed range 2497 * so is UnsafeUTFStringCodePoints<CP32, behavior, Range>. 2498 * Note that when given a range r that is an lvalue and is not a view, unsafeUTFStringCodePoints(r) uses 2499 * a ref_view of r as the Range type, which is a borrowed range. 
2500 * In practice, this means that given a container variable r, the iterators of 2501 * unsafeUTFStringCodePoints(r) can be used as long as iterators on r are valid, without having to keep 2502 * unsafeUTFStringCodePoints(r) around. 2503 * For instance: 2504 * \code 2505 * std::u8string s = "𒇧𒇧"; 2506 * // it outlives unsafeUTFStringCodePoints<char32_t>(s). 2507 * auto it = unsafeUTFStringCodePoints<char32_t>(s).begin(); 2508 * ++it; 2509 * char32_t second_code_point = it->codePoint(); // OK. 2510 * \endcode 2511 * 2512 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t 2513 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units 2514 * @draft ICU 78 2515 * @see unsafeUTFStringCodePoints 2516 */ 2517 template<typename CP32, typename Range> 2518 class UnsafeUTFStringCodePoints { 2519 static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); 2520 public: 2521 /** 2522 * Constructs an empty C++ "range" object. 2523 * @draft ICU 78 2524 */ 2525 UnsafeUTFStringCodePoints() = default; 2526 2527 /** 2528 * Constructs a C++ "range" object over the code points in the string. 2529 * @param unitRange input range 2530 * @draft ICU 78 2531 */ 2532 template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>> 2533 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {} 2534 /** 2535 * Constructs a C++ "range" object over the code points in the string, 2536 * keeping a reference to the code unit range. This overload is used by 2537 * utfStringCodePoints in C++17; in C++20, a ref_view is used instead (via 2538 * views::all). 2539 * @param unitRange input range 2540 * @draft ICU 78 2541 */ 2542 template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void> 2543 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {} 2544 2545 /** Copy constructor. 
@draft ICU 78 */ 2546 UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other) = default; 2547 2548 /** Copy assignment operator. @draft ICU 78 */ 2549 UnsafeUTFStringCodePoints &operator=(const UnsafeUTFStringCodePoints &other) = default; 2550 2551 /** 2552 * @return the range start iterator 2553 * @draft ICU 78 2554 */ 2555 auto begin() { 2556 return unsafeUTFIterator<CP32>(unitRange.begin()); 2557 } 2558 2559 /** 2560 * @return the range start iterator 2561 * @draft ICU 78 2562 */ 2563 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>> 2564 auto begin() const { 2565 return unsafeUTFIterator<CP32>(unitRange.begin()); 2566 } 2567 2568 /** 2569 * @return the range limit (exclusive end) iterator 2570 * @draft ICU 78 2571 */ 2572 auto end() { 2573 using UnitIter = decltype(unitRange.begin()); 2574 using LimitIter = decltype(unitRange.end()); 2575 if constexpr (!std::is_same_v<UnitIter, LimitIter>) { 2576 // Return the code unit sentinel. 2577 return unitRange.end(); 2578 } else { 2579 return unsafeUTFIterator<CP32>(unitRange.end()); 2580 } 2581 } 2582 2583 /** 2584 * @return the range limit (exclusive end) iterator 2585 * @draft ICU 78 2586 */ 2587 template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>> 2588 auto end() const { 2589 using UnitIter = decltype(unitRange.begin()); 2590 using LimitIter = decltype(unitRange.end()); 2591 if constexpr (!std::is_same_v<UnitIter, LimitIter>) { 2592 // Return the code unit sentinel. 
2593 return unitRange.end(); 2594 } else { 2595 return unsafeUTFIterator<CP32>(unitRange.end()); 2596 } 2597 } 2598 2599 /** 2600 * @return std::reverse_iterator(end()) 2601 * @draft ICU 78 2602 */ 2603 auto rbegin() const { 2604 return std::make_reverse_iterator(end()); 2605 } 2606 2607 /** 2608 * @return std::reverse_iterator(begin()) 2609 * @draft ICU 78 2610 */ 2611 auto rend() const { 2612 return std::make_reverse_iterator(begin()); 2613 } 2614 2615 private: 2616 Range unitRange; 2617 }; 2618 2619 /** @internal */ 2620 template<typename CP32> 2621 struct UnsafeUTFStringCodePointsAdaptor 2622 #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \ 2623 __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3. 2624 : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>> 2625 #endif 2626 { 2627 /** @internal */ 2628 template<typename Range> 2629 auto operator()(Range &&unitRange) const { 2630 #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2. 2631 return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange)); 2632 #else 2633 if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) { 2634 // Take basic_string_view by copy, not by reference. In C++20 this is handled by 2635 // all_t<Range>, which is Range if Range is a view. 2636 return UnsafeUTFStringCodePoints<CP32, std::decay_t<Range>>(std::forward<Range>(unitRange)); 2637 } else { 2638 return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange)); 2639 } 2640 #endif 2641 } 2642 }; 2643 2644 2645 /** 2646 * Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a 2647 * "range" of code points in a code unit range. The string must be well-formed. 2648 * Deduces the Range template parameter from the input, taking into account the value category: the 2649 * code units will be referenced if possible, and moved if necessary. 
2650 * 2651 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t 2652 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units 2653 * @param unitRange input range 2654 * @return an UnsafeUTFStringCodePoints<CP32, Range> for the given unitRange 2655 * @draft ICU 78 2656 */ 2657 template<typename CP32> 2658 constexpr UnsafeUTFStringCodePointsAdaptor<CP32> unsafeUTFStringCodePoints; 2659 2660 } // namespace U_HEADER_ONLY_NAMESPACE 2661 2662 2663 #if defined(__cpp_lib_ranges) 2664 template <typename CP32, UTFIllFormedBehavior behavior, typename Range> 2665 constexpr bool std::ranges::enable_borrowed_range< 2666 U_HEADER_ONLY_NAMESPACE::UTFStringCodePoints<CP32, behavior, Range>> = 2667 std::ranges::enable_borrowed_range<Range>; 2668 2669 template <typename CP32, typename Range> 2670 constexpr bool std::ranges::enable_borrowed_range< 2671 U_HEADER_ONLY_NAMESPACE::UnsafeUTFStringCodePoints<CP32, Range>> = 2672 std::ranges::enable_borrowed_range<Range>; 2673 #endif 2674 2675 #endif // U_HIDE_DRAFT_API 2676 #endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API 2677 #endif // __UTFITERATOR_H__