tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

utfiterator.h (97060B)


      1 // © 2024 and later: Unicode, Inc. and others.
      2 // License & terms of use: https://www.unicode.org/copyright.html
      3 
      4 // utfiterator.h
      5 // created: 2024aug12 Markus W. Scherer
      6 
      7 #ifndef __UTFITERATOR_H__
      8 #define __UTFITERATOR_H__
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
     13 
     14 #include <iterator>
     15 #if defined(__cpp_lib_ranges)
     16 #include <ranges>
     17 #endif
     18 #include <string>
     19 #include <string_view>
     20 #include <type_traits>
     21 #include "unicode/utf16.h"
     22 #include "unicode/utf8.h"
     23 #include "unicode/uversion.h"
     24 
     25 /**
     26 * \file
     27 * \brief C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed).
     28 *
     29 * Sample code:
     30 * \code
     31 * #include <string_view>
     32 * #include <iostream>
     33 * #include "unicode/utypes.h"
     34 * #include "unicode/utfiterator.h"
     35 *
     36 * using icu::header::utfIterator;
     37 * using icu::header::utfStringCodePoints;
     38 * using icu::header::unsafeUTFIterator;
     39 * using icu::header::unsafeUTFStringCodePoints;
     40 *
     41 * int32_t rangeLoop16(std::u16string_view s) {
     42 *     // We are just adding up the code points for minimal-code demonstration purposes.
     43 *     int32_t sum = 0;
     44 *     for (auto units : utfStringCodePoints<UChar32, UTF_BEHAVIOR_NEGATIVE>(s)) {
     45 *         sum += units.codePoint();  // < 0 if ill-formed
     46 *     }
     47 *     return sum;
     48 * }
     49 *
     50 * int32_t loopIterPlusPlus16(std::u16string_view s) {
     51 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
     52 *     int32_t sum = 0;
     53 *     for (auto iter = range.begin(), limit = range.end(); iter != limit;) {
     54 *         sum += (*iter++).codePoint();  // U+FFFD if ill-formed
     55 *     }
     56 *     return sum;
     57 * }
     58 *
     59 * int32_t backwardLoop16(std::u16string_view s) {
     60 *     auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
     61 *     int32_t sum = 0;
     62 *     for (auto start = range.begin(), iter = range.end(); start != iter;) {
     63 *         sum += (*--iter).codePoint();  // surrogate code point if unpaired / ill-formed
     64 *     }
     65 *     return sum;
     66 * }
     67 *
     68 * int32_t reverseLoop8(std::string_view s) {
     69 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
     70 *     int32_t sum = 0;
     71 *     for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) {
     72 *         sum += iter->codePoint();  // U+FFFD if ill-formed
     73 *     }
     74 *     return sum;
     75 * }
     76 *
     77 * int32_t countCodePoints16(std::u16string_view s) {
     78 *     auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
     79 *     return std::distance(range.begin(), range.end());
     80 * }
     81 *
     82 * int32_t unsafeRangeLoop16(std::u16string_view s) {
     83 *     int32_t sum = 0;
     84 *     for (auto units : unsafeUTFStringCodePoints<UChar32>(s)) {
     85 *         sum += units.codePoint();
     86 *     }
     87 *     return sum;
     88 * }
     89 *
     90 * int32_t unsafeReverseLoop8(std::string_view s) {
     91 *     auto range = unsafeUTFStringCodePoints<UChar32>(s);
     92 *     int32_t sum = 0;
     93 *     for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) {
     94 *         sum += iter->codePoint();
     95 *     }
     96 *     return sum;
     97 * }
     98 *
     99 * char32_t firstCodePointOrFFFD16(std::u16string_view s) {
    100 *     if (s.empty()) { return 0xfffd; }
    101 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
    102 *     return range.begin()->codePoint();
    103 * }
    104 *
    105 * std::string_view firstSequence8(std::string_view s) {
    106 *     if (s.empty()) { return {}; }
    107 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
    108 *     auto units = *(range.begin());
    109 *     if (units.wellFormed()) {
    110 *         return units.stringView();
    111 *     } else {
    112 *         return {};
    113 *     }
    114 * }
    115 *
    116 * template<typename InputStream>  // some istream or streambuf
    117 * std::u32string cpFromInput(InputStream &in) {
    118 *     // This is a single-pass input_iterator.
    119 *     std::istreambuf_iterator bufIter(in);
    120 *     std::istreambuf_iterator<typename InputStream::char_type> bufLimit;
    121 *     auto iter = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufIter);
    122 *     auto limit = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufLimit);
    123 *     std::u32string s32;
    124 *     for (; iter != limit; ++iter) {
    125 *         s32.push_back(iter->codePoint());
    126 *     }
    127 *     return s32;
    128 * }
    129 *
    130 * std::u32string cpFromStdin() { return cpFromInput(std::cin); }
    131 * std::u32string cpFromWideStdin() { return cpFromInput(std::wcin); }
    132 * \endcode
    133 */
    134 
    135 #ifndef U_HIDE_DRAFT_API
    136 
    137 /**
    138 * Some defined behaviors for handling ill-formed Unicode strings.
    139 * This is a template parameter for UTFIterator and related classes.
    140 *
    141 * When a validating UTFIterator encounters an ill-formed code unit sequence,
    142 * then CodeUnits.codePoint() is a value according to this parameter.
    143 *
    144 * @draft ICU 78
    145 * @see CodeUnits
    146 * @see UTFIterator
    147 * @see UTFStringCodePoints
    148 */
    149 typedef enum UTFIllFormedBehavior {
    150    /**
    151     * Returns a negative value (-1=U_SENTINEL) instead of a code point.
    152     * If the CP32 template parameter for the relevant classes is an unsigned type,
    153     * then the negative value becomes 0xffffffff=UINT32_MAX.
    154     *
    155     * @draft ICU 78
    156     */
    157    UTF_BEHAVIOR_NEGATIVE,
    158    /** Returns U+FFFD Replacement Character. @draft ICU 78 */
    159    UTF_BEHAVIOR_FFFD,
    160    /**
    161     * UTF-8: Not allowed;
    162     * UTF-16: returns the unpaired surrogate;
    163     * UTF-32: returns the surrogate code point, or U+FFFD if out of range.
    164     *
    165     * @draft ICU 78
    166     */
    167    UTF_BEHAVIOR_SURROGATE
    168 } UTFIllFormedBehavior;
    169 
    170 namespace U_HEADER_ONLY_NAMESPACE {
    171 
    172 namespace prv {
    173 #if U_CPLUSPLUS_VERSION >= 20
    174 
    175 /** @internal */
    176 template<typename Iter>
    177 using iter_value_t = typename std::iter_value_t<Iter>;
    178 
    179 /** @internal */
    180 template<typename Iter>
    181 using iter_difference_t = std::iter_difference_t<Iter>;
    182 
    183 /** @internal */
    184 template<typename Iter>
    185 constexpr bool forward_iterator = std::forward_iterator<Iter>;
    186 
    187 /** @internal */
    188 template<typename Iter>
    189 constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>;
    190 
    191 /** @internal */
    192 template<typename Range>
    193 constexpr bool range = std::ranges::range<Range>;
    194 
    195 #else
    196 
    197 /** @internal */
    198 template<typename Iter>
    199 using iter_value_t = typename std::iterator_traits<Iter>::value_type;
    200 
    201 /** @internal */
    202 template<typename Iter>
    203 using iter_difference_t = typename std::iterator_traits<Iter>::difference_type;
    204 
    205 /** @internal */
    206 template<typename Iter>
    207 constexpr bool forward_iterator =
    208    std::is_base_of_v<
    209        std::forward_iterator_tag,
    210        typename std::iterator_traits<Iter>::iterator_category>;
    211 
    212 /** @internal */
    213 template<typename Iter>
    214 constexpr bool bidirectional_iterator =
    215    std::is_base_of_v<
    216        std::bidirectional_iterator_tag,
    217        typename std::iterator_traits<Iter>::iterator_category>;
    218 
    219 /** @internal */
    220 template<typename Range, typename = void>
    221 struct range_type : std::false_type {};
    222 
    223 /** @internal */
    224 template<typename Range>
    225 struct range_type<
    226    Range,
    227    std::void_t<decltype(std::declval<Range>().begin()),
    228                decltype(std::declval<Range>().end())>> : std::true_type {};
    229 
    230 /** @internal */
    231 template<typename Range>
    232 constexpr bool range = range_type<Range>::value;
    233 
    234 #endif
    235 
    236 /** @internal */
    237 template <typename T> struct is_basic_string_view : std::false_type {};
    238 
    239 /** @internal */
    240 template <typename... Args>
    241 struct is_basic_string_view<std::basic_string_view<Args...>> : std::true_type {};
    242 
    243 /** @internal */
    244 template <typename T> constexpr bool is_basic_string_view_v = is_basic_string_view<T>::value;
    245 
    246 /** @internal */
    247 template<typename CP32, bool skipSurrogates>
    248 class CodePointsIterator {
    249    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    250 public:
    251    /** C++ iterator boilerplate @internal */
    252    using value_type = CP32;
    253    /** C++ iterator boilerplate @internal */
    254    using reference = value_type;
    255    /** C++ iterator boilerplate @internal */
    256    using pointer = CP32 *;
    257    /** C++ iterator boilerplate @internal */
    258    using difference_type = int32_t;
    259    /** C++ iterator boilerplate @internal */
    260    using iterator_category = std::forward_iterator_tag;
    261 
    262    /** @internal */
    263    inline CodePointsIterator(CP32 c) : c_(c) {}
    264    /** @internal */
    265    inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
    266    /** @internal */
    267    inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
    268    /** @internal */
    269    inline CP32 operator*() const { return c_; }
    270    /** @internal */
    271    inline CodePointsIterator &operator++() {  // pre-increment
    272        ++c_;
    273        if (skipSurrogates && c_ == 0xd800) {
    274            c_ = 0xe000;
    275        }
    276        return *this;
    277    }
    278    /** @internal */
    279    inline CodePointsIterator operator++(int) {  // post-increment
    280        CodePointsIterator result(*this);
    281        ++(*this);
    282        return result;
    283    }
    284 
    285 private:
    286    CP32 c_;
    287 };
    288 
    289 }  // namespace prv
    290 
    291 /**
    292 * A C++ "range" over all Unicode code points U+0000..U+10FFFF.
    293 * https://www.unicode.org/glossary/#code_point
    294 *
    295 * Intended for test and builder code.
    296 *
    297 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
    298 * @draft ICU 78
    299 * @see U_IS_CODE_POINT
    300 */
    301 template<typename CP32>
    302 class AllCodePoints {
    303    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    304 public:
    305    /** Constructor. @draft ICU 78 */
    306    AllCodePoints() {}
    307    /**
    308     * @return an iterator over all Unicode code points.
    309     *     The iterator returns CP32 integers.
    310     * @draft ICU 78
    311     */
    312    auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
    313    /**
    314     * @return an exclusive-end iterator over all Unicode code points.
    315     * @draft ICU 78
    316     */
    317    auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
    318 };
    319 
    320 /**
    321 * A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF.
    322 * That is, all code points except surrogates.
    323 * Only scalar values can be represented in well-formed UTF-8/16/32.
    324 * https://www.unicode.org/glossary/#unicode_scalar_value
    325 *
    326 * Intended for test and builder code.
    327 *
    328 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
    329 * @draft ICU 78
    330 * @see U_IS_SCALAR_VALUE
    331 */
    332 template<typename CP32>
    333 class AllScalarValues {
    334    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    335 public:
    336    /** Constructor. @draft ICU 78 */
    337    AllScalarValues() {}
    338    /**
    339     * @return an iterator over all Unicode scalar values.
    340     *     The iterator returns CP32 integers.
    341     * @draft ICU 78
    342     */
    343    auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
    344    /**
    345     * @return an exclusive-end iterator over all Unicode scalar values.
    346     * @draft ICU 78
    347     */
    348    auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
    349 };
    350 
    351 /**
    352 * Result of decoding a code unit sequence for one code point.
    353 * Returned from non-validating Unicode string code point iterators.
    354 * Base class for class CodeUnits which is returned from validating iterators.
    355 *
    356 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
    357 *              should be signed if UTF_BEHAVIOR_NEGATIVE
    358 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
    359 *     UTF-8: char or char8_t or uint8_t;
    360 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
    361 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
    362 * @see UnsafeUTFIterator
    363 * @see UnsafeUTFStringCodePoints
    364 * @draft ICU 78
    365 */
    366 template<typename CP32, typename UnitIter, typename = void>
    367 class UnsafeCodeUnits {
    368    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    369    using Unit = typename prv::iter_value_t<UnitIter>;
    370 public:
    371    /** @internal */
    372    UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) :
    373            c_(codePoint), len_(length), start_(start), limit_(limit) {}
    374 
    375    /** Copy constructor. @draft ICU 78 */
    376    UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
    377    /** Copy assignment operator. @draft ICU 78 */
    378    UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
    379 
    380    /**
    381     * @return the Unicode code point decoded from the code unit sequence.
    382     *     If the sequence is ill-formed and the iterator validates,
    383     *     then this is a replacement value according to the iterator‘s
    384     *     UTFIllFormedBehavior template parameter.
    385     * @draft ICU 78
    386     */
    387    CP32 codePoint() const { return c_; }
    388 
    389    /**
    390     * @return the start of the code unit sequence for one code point.
    391     * Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
    392     * @draft ICU 78
    393     */
    394    UnitIter begin() const { return start_; }
    395 
    396    /**
    397     * @return the limit (exclusive end) of the code unit sequence for one code point.
    398     * Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
    399     * @draft ICU 78
    400     */
    401    UnitIter end() const { return limit_; }
    402 
    403    /**
    404     * @return the length of the code unit sequence for one code point.
    405     * @draft ICU 78
    406     */
    407    uint8_t length() const { return len_; }
    408 
    409 #if U_CPLUSPLUS_VERSION >= 20
    410    /**
    411     * @return a string_view of the code unit sequence for one code point.
    412     * Only works if UnitIter is a pointer or a contiguous_iterator.
    413     * @draft ICU 78
    414     */
    415    template<std::contiguous_iterator Iter = UnitIter>
    416    std::basic_string_view<Unit> stringView() const {
    417        return std::basic_string_view<Unit>(begin(), end());
    418    }
    419 #else
    420    /**
    421     * @return a string_view of the code unit sequence for one code point.
    422     * Only works if UnitIter is a pointer or a contiguous_iterator.
    423     * @draft ICU 78
    424     */
    425    template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
    426    std::enable_if_t<std::is_pointer_v<Iter> ||
    427                         std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
    428                         std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
    429                         std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
    430                         std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
    431                     std::basic_string_view<Unit>>
    432    stringView() const {
    433        return std::basic_string_view<Unit>(&*start_, len_);
    434    }
    435 #endif
    436 
    437 private:
    438    // Order of fields with padding and access frequency in mind.
    439    CP32 c_;
    440    uint8_t len_;
    441    UnitIter start_;
    442    UnitIter limit_;
    443 };
    444 
    445 #ifndef U_IN_DOXYGEN
    446 // Partial template specialization for single-pass input iterator.
    447 // No UnitIter field, no getter for it, no stringView().
    448 template<typename CP32, typename UnitIter>
    449 class UnsafeCodeUnits<
    450        CP32,
    451        UnitIter,
    452        std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
    453    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    454 public:
    455    UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {}
    456 
    457    UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
    458    UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
    459 
    460    CP32 codePoint() const { return c_; }
    461 
    462    uint8_t length() const { return len_; }
    463 
    464 private:
    465    // Order of fields with padding and access frequency in mind.
    466    CP32 c_;
    467    uint8_t len_;
    468 };
    469 #endif  // U_IN_DOXYGEN
    470 
    471 /**
    472 * Result of validating and decoding a code unit sequence for one code point.
    473 * Returned from validating Unicode string code point iterators.
    474 * Adds function wellFormed() to base class UnsafeCodeUnits.
    475 *
    476 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
    477 *              should be signed if UTF_BEHAVIOR_NEGATIVE
    478 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
    479 *     UTF-8: char or char8_t or uint8_t;
    480 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
    481 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
    482 * @see UTFIterator
    483 * @see UTFStringCodePoints
    484 * @draft ICU 78
    485 */
    486 template<typename CP32, typename UnitIter, typename = void>
    487 class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> {
    488 public:
    489    /** @internal */
    490    CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) :
    491            UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {}
    492 
    493    /** Copy constructor. @draft ICU 78 */
    494    CodeUnits(const CodeUnits &other) = default;
    495    /** Copy assignment operator. @draft ICU 78 */
    496    CodeUnits &operator=(const CodeUnits &other) = default;
    497 
    498    /**
    499     * @return true if the decoded code unit sequence is well-formed.
    500     * @draft ICU 78
    501     */
    502    bool wellFormed() const { return ok_; }
    503 
    504 private:
    505    bool ok_;
    506 };
    507 
    508 #ifndef U_IN_DOXYGEN
    509 // Partial template specialization for single-pass input iterator.
    510 // No UnitIter field, no getter for it, no stringView().
    511 template<typename CP32, typename UnitIter>
    512 class CodeUnits<
    513        CP32,
    514        UnitIter,
    515        std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
    516            public UnsafeCodeUnits<CP32, UnitIter> {
    517 public:
    518    CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) :
    519            UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {}
    520 
    521    CodeUnits(const CodeUnits &other) = default;
    522    CodeUnits &operator=(const CodeUnits &other) = default;
    523 
    524    bool wellFormed() const { return ok_; }
    525 
    526 private:
    527    bool ok_;
    528 };
    529 #endif  // U_IN_DOXYGEN
    530 
    531 // Validating implementations ---------------------------------------------- ***
    532 
    533 #ifndef U_IN_DOXYGEN
// Primary template of the validating decoder; it is only declared here.
// The work is done by partial specializations which use the last, unnamed
// template parameter with std::enable_if_t to select an implementation
// by the size of the code unit type that UnitIter yields
// (size 1 for UTF-8, size 2 for UTF-16).
// LimitIter is the type of the end position that UnitIter is compared with;
// it defaults to UnitIter itself.
template<typename CP32, UTFIllFormedBehavior behavior,
         typename UnitIter, typename LimitIter = UnitIter, typename = void>
class UTFImpl;
    537 
    538 // Note: readAndInc() functions take both a p0 and a p iterator.
    539 // They must have the same value.
    540 // For a multi-pass UnitIter, the caller must copy its p into a local variable p0,
    541 // and readAndInc() copies p0 and the incremented p into the CodeUnits.
    542 // For a single-pass UnitIter, which may not be default-constructible nor copyable,
    543 // the caller can pass p into both references, and readAndInc() does not use p0
    544 // and constructs CodeUnits without them.
    545 // Moving the p0 variable into the call site avoids having to declare it inside readAndInc()
    546 // which may not be possible for a single-pass iterator.
    547 
    548 // UTF-8
    549 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
    550 class UTFImpl<
    551        CP32, behavior,
    552        UnitIter, LimitIter,
    553        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
    554    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    555    static_assert(behavior != UTF_BEHAVIOR_SURROGATE,
    556                  "For 8-bit strings, the SURROGATE option does not have an equivalent.");
    557 public:
    558    // Handle ill-formed UTF-8
    559    U_FORCE_INLINE static CP32 sub() {
    560        switch (behavior) {
    561            case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
    562            case UTF_BEHAVIOR_FFFD: return 0xfffd;
    563        }
    564    }
    565 
    566    U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
    567        // Very similar to U8_FWD_1().
    568        uint8_t b = *p;
    569        ++p;
    570        if (U8_IS_LEAD(b) && p != limit) {
    571            uint8_t t1 = *p;
    572            if ((0xe0 <= b && b < 0xf0)) {
    573                if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
    574                        ++p != limit && U8_IS_TRAIL(*p)) {
    575                    ++p;
    576                }
    577            } else if (b < 0xe0) {
    578                if (U8_IS_TRAIL(t1)) {
    579                    ++p;
    580                }
    581            } else /* b >= 0xf0 */ {
    582                if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
    583                        ++p != limit && U8_IS_TRAIL(*p) &&
    584                        ++p != limit && U8_IS_TRAIL(*p)) {
    585                    ++p;
    586                }
    587            }
    588        }
    589    }
    590 
    591    U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
    592        // Very similar to U8_BACK_1().
    593        uint8_t c = *--p;
    594        if (U8_IS_TRAIL(c) && p != start) {
    595            UnitIter p1 = p;
    596            uint8_t b1 = *--p1;
    597            if (U8_IS_LEAD(b1)) {
    598                if (b1 < 0xe0 ||
    599                        (b1 < 0xf0 ?
    600                            U8_IS_VALID_LEAD3_AND_T1(b1, c) :
    601                            U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
    602                    p = p1;
    603                    return;
    604                }
    605            } else if (U8_IS_TRAIL(b1) && p1 != start) {
    606                uint8_t b2 = *--p1;
    607                if (0xe0 <= b2 && b2 <= 0xf4) {
    608                    if (b2 < 0xf0 ?
    609                            U8_IS_VALID_LEAD3_AND_T1(b2, b1) :
    610                            U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
    611                        p = p1;
    612                        return;
    613                    }
    614                } else if (U8_IS_TRAIL(b2) && p1 != start) {
    615                    uint8_t b3 = *--p1;
    616                    if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
    617                        p = p1;
    618                        return;
    619                    }
    620                }
    621            }
    622        }
    623    }
    624 
    625    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
    626            UnitIter &p0, UnitIter &p, const LimitIter &limit) {
    627        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
    628        // Very similar to U8_NEXT_OR_FFFD().
    629        CP32 c = uint8_t(*p);
    630        ++p;
    631        if (U8_IS_SINGLE(c)) {
    632            if constexpr (isMultiPass) {
    633                return {c, 1, true, p0, p};
    634            } else {
    635                return {c, 1, true};
    636            }
    637        }
    638        uint8_t length = 1;
    639        uint8_t t = 0;
    640        if (p != limit &&
    641                // fetch/validate/assemble all but last trail byte
    642                (c >= 0xe0 ?
    643                    (c < 0xf0 ?  // U+0800..U+FFFF except surrogates
    644                        U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
    645                        (t &= 0x3f, 1)
    646                    :  // U+10000..U+10FFFF
    647                        (c -= 0xf0) <= 4 &&
    648                        U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
    649                        (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
    650                        (t = *p - 0x80) <= 0x3f) &&
    651                    // valid second-to-last trail byte
    652                    (c = (c << 6) | t, ++length, ++p != limit)
    653                :  // U+0080..U+07FF
    654                    c >= 0xc2 && (c &= 0x1f, 1)) &&
    655                // last trail byte
    656                (t = *p - 0x80) <= 0x3f) {
    657            c = (c << 6) | t;
    658            ++length;
    659            ++p;
    660            if constexpr (isMultiPass) {
    661                return {c, length, true, p0, p};
    662            } else {
    663                return {c, length, true};
    664            }
    665        }
    666        if constexpr (isMultiPass) {
    667            return {sub(), length, false, p0, p};
    668        } else {
    669            return {sub(), length, false};
    670        }
    671    }
    672 
    673    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
    674        // Very similar to U8_PREV_OR_FFFD().
    675        UnitIter p0 = p;
    676        CP32 c = uint8_t(*--p);
    677        if (U8_IS_SINGLE(c)) {
    678            return {c, 1, true, p, p0};
    679        }
    680        if (U8_IS_TRAIL(c) && p != start) {
    681            UnitIter p1 = p;
    682            uint8_t b1 = *--p1;
    683            if (U8_IS_LEAD(b1)) {
    684                if (b1 < 0xe0) {
    685                    p = p1;
    686                    c = ((b1 - 0xc0) << 6) | (c & 0x3f);
    687                    return {c, 2, true, p, p0};
    688                } else if (b1 < 0xf0 ?
    689                            U8_IS_VALID_LEAD3_AND_T1(b1, c) :
    690                            U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
    691                    // Truncated 3- or 4-byte sequence.
    692                    p = p1;
    693                    return {sub(), 2, false, p, p0};
    694                }
    695            } else if (U8_IS_TRAIL(b1) && p1 != start) {
    696                // Extract the value bits from the last trail byte.
    697                c &= 0x3f;
    698                uint8_t b2 = *--p1;
    699                if (0xe0 <= b2 && b2 <= 0xf4) {
    700                    if (b2 < 0xf0) {
    701                        b2 &= 0xf;
    702                        if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
    703                            p = p1;
    704                            c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
    705                            return {c, 3, true, p, p0};
    706                        }
    707                    } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
    708                        // Truncated 4-byte sequence.
    709                        p = p1;
    710                        return {sub(), 3, false, p, p0};
    711                    }
    712                } else if (U8_IS_TRAIL(b2) && p1 != start) {
    713                    uint8_t b3 = *--p1;
    714                    if (0xf0 <= b3 && b3 <= 0xf4) {
    715                        b3 &= 7;
    716                        if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
    717                            p = p1;
    718                            c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
    719                            return {c, 4, true, p, p0};
    720                        }
    721                    }
    722                }
    723            }
    724        }
    725        return {sub(), 1, false, p, p0};
    726    }
    727 };
    728 
    729 // UTF-16
    730 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
    731 class UTFImpl<
    732        CP32, behavior,
    733        UnitIter, LimitIter,
    734        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
    735    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    736 public:
    737    // Handle ill-formed UTF-16: One unpaired surrogate.
    738    U_FORCE_INLINE static CP32 sub(CP32 surrogate) {
    739        switch (behavior) {
    740            case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
    741            case UTF_BEHAVIOR_FFFD: return 0xfffd;
    742            case UTF_BEHAVIOR_SURROGATE: return surrogate;
    743        }
    744    }
    745 
    746    U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
    747        // Very similar to U16_FWD_1().
    748        auto c = *p;
    749        ++p;
    750        if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
    751            ++p;
    752        }
    753    }
    754 
    755    U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
    756        // Very similar to U16_BACK_1().
    757        UnitIter p1;
    758        if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) {
    759            p = p1;
    760        }
    761    }
    762 
    763    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
    764            UnitIter &p0, UnitIter &p, const LimitIter &limit) {
    765        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
    766        // Very similar to U16_NEXT_OR_FFFD().
    767        CP32 c = static_cast<CP32>(*p);
    768        ++p;
    769        if (!U16_IS_SURROGATE(c)) {
    770            if constexpr (isMultiPass) {
    771                return {c, 1, true, p0, p};
    772            } else {
    773                return {c, 1, true};
    774            }
    775        } else {
    776            uint16_t c2;
    777            if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) {
    778                ++p;
    779                c = U16_GET_SUPPLEMENTARY(c, c2);
    780                if constexpr (isMultiPass) {
    781                    return {c, 2, true, p0, p};
    782                } else {
    783                    return {c, 2, true};
    784                }
    785            } else {
    786                if constexpr (isMultiPass) {
    787                    return {sub(c), 1, false, p0, p};
    788                } else {
    789                    return {sub(c), 1, false};
    790                }
    791            }
    792        }
    793    }
    794 
    795    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
    796        // Very similar to U16_PREV_OR_FFFD().
    797        UnitIter p0 = p;
    798        CP32 c = static_cast<CP32>(*--p);
    799        if (!U16_IS_SURROGATE(c)) {
    800            return {c, 1, true, p, p0};
    801        } else {
    802            UnitIter p1;
    803            uint16_t c2;
    804            if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) {
    805                p = p1;
    806                c = U16_GET_SUPPLEMENTARY(c2, c);
    807                return {c, 2, true, p, p0};
    808            } else {
    809                return {sub(c), 1, false, p, p0};
    810            }
    811        }
    812    }
    813 };
    814 
    815 // UTF-32: trivial, but still validating
    816 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
    817 class UTFImpl<
    818        CP32, behavior,
    819        UnitIter, LimitIter,
    820        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
    821    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    822 public:
    823    // Handle ill-formed UTF-32
    824    U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) {
    825        switch (behavior) {
    826            case UTF_BEHAVIOR_NEGATIVE: return U_SENTINEL;
    827            case UTF_BEHAVIOR_FFFD: return 0xfffd;
    828            case UTF_BEHAVIOR_SURROGATE: return forSurrogate ? surrogate : 0xfffd;
    829        }
    830    }
    831 
    832    U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) {
    833        ++p;
    834    }
    835 
    836    U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) {
    837        --p;
    838    }
    839 
    840    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
    841            UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) {
    842        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
    843        uint32_t uc = *p;
    844        CP32 c = uc;
    845        ++p;
    846        if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
    847            if constexpr (isMultiPass) {
    848                return {c, 1, true, p0, p};
    849            } else {
    850                return {c, 1, true};
    851            }
    852        } else {
    853            if constexpr (isMultiPass) {
    854                return {sub(uc < 0xe000, c), 1, false, p0, p};
    855            } else {
    856                return {sub(uc < 0xe000, c), 1, false};
    857            }
    858        }
    859    }
    860 
    861    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) {
    862        UnitIter p0 = p;
    863        uint32_t uc = *--p;
    864        CP32 c = uc;
    865        if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
    866            return {c, 1, true, p, p0};
    867        } else {
    868            return {sub(uc < 0xe000, c), 1, false, p, p0};
    869        }
    870    }
    871 };
    872 
    873 // Non-validating implementations ------------------------------------------ ***
    874 
    875 template<typename CP32, typename UnitIter, typename = void>
    876 class UnsafeUTFImpl;
    877 
    878 // UTF-8
    879 template<typename CP32, typename UnitIter>
    880 class UnsafeUTFImpl<
    881        CP32,
    882        UnitIter,
    883        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
    884    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    885 public:
    886    U_FORCE_INLINE static void inc(UnitIter &p) {
    887        // Very similar to U8_FWD_1_UNSAFE().
    888        uint8_t b = *p;
    889        std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b));
    890    }
    891 
    892    U_FORCE_INLINE static void dec(UnitIter &p) {
    893        // Very similar to U8_BACK_1_UNSAFE().
    894        while (U8_IS_TRAIL(*--p)) {}
    895    }
    896 
    897    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
    898        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
    899        // Very similar to U8_NEXT_UNSAFE().
    900        CP32 c = uint8_t(*p);
    901        ++p;
    902        if (U8_IS_SINGLE(c)) {
    903            if constexpr (isMultiPass) {
    904                return {c, 1, p0, p};
    905            } else {
    906                return {c, 1};
    907            }
    908        } else if (c < 0xe0) {
    909            c = ((c & 0x1f) << 6) | (*p & 0x3f);
    910            ++p;
    911            if constexpr (isMultiPass) {
    912                return {c, 2, p0, p};
    913            } else {
    914                return {c, 2};
    915            }
    916        } else if (c < 0xf0) {
    917            // No need for (c&0xf) because the upper bits are truncated
    918            // after <<12 in the cast to uint16_t.
    919            c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
    920            ++p;
    921            c |= *p & 0x3f;
    922            ++p;
    923            if constexpr (isMultiPass) {
    924                return {c, 3, p0, p};
    925            } else {
    926                return {c, 3};
    927            }
    928        } else {
    929            c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
    930            ++p;
    931            c |= (*p & 0x3f) << 6;
    932            ++p;
    933            c |= *p & 0x3f;
    934            ++p;
    935            if constexpr (isMultiPass) {
    936                return {c, 4, p0, p};
    937            } else {
    938                return {c, 4};
    939            }
    940        }
    941    }
    942 
    943    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
    944        // Very similar to U8_PREV_UNSAFE().
    945        UnitIter p0 = p;
    946        CP32 c = uint8_t(*--p);
    947        if (U8_IS_SINGLE(c)) {
    948            return {c, 1, p, p0};
    949        }
    950        // U8_IS_TRAIL(c) if well-formed
    951        c &= 0x3f;
    952        uint8_t count = 1;
    953        for (uint8_t shift = 6;;) {
    954            uint8_t b = *--p;
    955            if (b >= 0xc0) {
    956                U8_MASK_LEAD_BYTE(b, count);
    957                c |= uint32_t{b} << shift;
    958                break;
    959            } else {
    960                c |= (uint32_t{b} & 0x3f) << shift;
    961                ++count;
    962                shift += 6;
    963            }
    964        }
    965        ++count;
    966        return {c, count, p, p0};
    967    }
    968 };
    969 
    970 // UTF-16
    971 template<typename CP32, typename UnitIter>
    972 class UnsafeUTFImpl<
    973        CP32,
    974        UnitIter,
    975        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
    976    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
    977 public:
    978    U_FORCE_INLINE static void inc(UnitIter &p) {
    979        // Very similar to U16_FWD_1_UNSAFE().
    980        auto c = *p;
    981        ++p;
    982        if (U16_IS_LEAD(c)) {
    983            ++p;
    984        }
    985    }
    986 
    987    U_FORCE_INLINE static void dec(UnitIter &p) {
    988        // Very similar to U16_BACK_1_UNSAFE().
    989        if (U16_IS_TRAIL(*--p)) {
    990            --p;
    991        }
    992    }
    993 
    994    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
    995        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
    996        // Very similar to U16_NEXT_UNSAFE().
    997        CP32 c = static_cast<CP32>(*p);
    998        ++p;
    999        if (!U16_IS_LEAD(c)) {
   1000            if constexpr (isMultiPass) {
   1001                return {c, 1, p0, p};
   1002            } else {
   1003                return {c, 1};
   1004            }
   1005        } else {
   1006            uint16_t c2 = *p;
   1007            ++p;
   1008            c = U16_GET_SUPPLEMENTARY(c, c2);
   1009            if constexpr (isMultiPass) {
   1010                return {c, 2, p0, p};
   1011            } else {
   1012                return {c, 2};
   1013            }
   1014        }
   1015    }
   1016 
   1017    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
   1018        // Very similar to U16_PREV_UNSAFE().
   1019        UnitIter p0 = p;
   1020        CP32 c = static_cast<CP32>(*--p);
   1021        if (!U16_IS_TRAIL(c)) {
   1022            return {c, 1, p, p0};
   1023        } else {
   1024            uint16_t c2 = *--p;
   1025            c = U16_GET_SUPPLEMENTARY(c2, c);
   1026            return {c, 2, p, p0};
   1027        }
   1028    }
   1029 };
   1030 
   1031 // UTF-32: trivial
   1032 template<typename CP32, typename UnitIter>
   1033 class UnsafeUTFImpl<
   1034        CP32,
   1035        UnitIter,
   1036        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
   1037    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
   1038 public:
   1039    U_FORCE_INLINE static void inc(UnitIter &p) {
   1040        ++p;
   1041    }
   1042 
   1043    U_FORCE_INLINE static void dec(UnitIter &p) {
   1044        --p;
   1045    }
   1046 
   1047    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
   1048        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
   1049        CP32 c = *p;
   1050        ++p;
   1051        if constexpr (isMultiPass) {
   1052            return {c, 1, p0, p};
   1053        } else {
   1054            return {c, 1};
   1055        }
   1056    }
   1057 
   1058    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
   1059        UnitIter p0 = p;
   1060        CP32 c = *--p;
   1061        return {c, 1, p, p0};
   1062    }
   1063 };
   1064 
   1065 #endif
   1066 
   1067 // Validating iterators ---------------------------------------------------- ***
   1068 
   1069 /**
   1070 * Validating iterator over the code points in a Unicode string.
   1071 *
   1072 * The UnitIter can be
   1073 * an input_iterator, a forward_iterator, or a bidirectional_iterator (including a pointer).
   1074 * The UTFIterator will have the corresponding iterator_category.
   1075 *
   1076 * Call utfIterator() to have the compiler deduce the UnitIter and LimitIter types.
   1077 *
   1078 * For reverse iteration, either use this iterator directly as in <code>*--iter</code>
   1079 * or wrap it using std::make_reverse_iterator(iter).
   1080 *
   1081 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
   1082 *              should be signed if UTF_BEHAVIOR_NEGATIVE
   1083 * @tparam behavior How to handle ill-formed Unicode strings
   1084 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
   1085 *     UTF-8: char or char8_t or uint8_t;
   1086 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
   1087 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
   1088 * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
   1089 * @draft ICU 78
   1090 * @see utfIterator
   1091 */
   1092 template<typename CP32, UTFIllFormedBehavior behavior,
   1093         typename UnitIter, typename LimitIter = UnitIter, typename = void>
   1094 class UTFIterator {
   1095    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
   1096    using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
   1097 
   1098    // Proxy type for operator->() (required by LegacyInputIterator)
   1099    // so that we don't promise always returning CodeUnits.
   1100    class Proxy {
   1101    public:
   1102        explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
   1103        CodeUnits<CP32, UnitIter> &operator*() { return units_; }
   1104        CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
   1105    private:
   1106        CodeUnits<CP32, UnitIter> units_;
   1107    };
   1108 
   1109 public:
   1110    /** C++ iterator boilerplate @internal */
   1111    using value_type = CodeUnits<CP32, UnitIter>;
   1112    /** C++ iterator boilerplate @internal */
   1113    using reference = value_type;
   1114    /** C++ iterator boilerplate @internal */
   1115    using pointer = Proxy;
   1116    /** C++ iterator boilerplate @internal */
   1117    using difference_type = prv::iter_difference_t<UnitIter>;
   1118    /** C++ iterator boilerplate @internal */
   1119    using iterator_category = std::conditional_t<
   1120        prv::bidirectional_iterator<UnitIter>,
   1121        std::bidirectional_iterator_tag,
   1122        std::forward_iterator_tag>;
   1123 
   1124    /**
   1125     * Constructor with start <= p < limit.
   1126     * All of these iterators/pointers should be at code point boundaries.
   1127     * Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
   1128     *
   1129     * When using a code unit sentinel (UnitIter≠LimitIter),
   1130     * then that sentinel also works as a sentinel for this code point iterator.
   1131     *
   1132     * @param start Start of the range
   1133     * @param p Initial position inside the range
   1134     * @param limit Limit (exclusive end) of the range
   1135     * @draft ICU 78
   1136     */
   1137    U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) :
   1138            p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
   1139    /**
   1140     * Constructor with start == p < limit.
   1141     * All of these iterators/pointers should be at code point boundaries.
   1142     *
   1143     * When using a code unit sentinel (UnitIter≠LimitIter),
   1144     * then that sentinel also works as a sentinel for this code point iterator.
   1145     *
   1146     * @param p Start of the range, and the initial position
   1147     * @param limit Limit (exclusive end) of the range
   1148     * @draft ICU 78
   1149     */
   1150    U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) :
   1151            p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
   1152    /**
   1153     * Constructs an iterator start or limit sentinel.
   1154     * The iterator/pointer should be at a code point boundary.
   1155     * Requires UnitIter to be copyable.
   1156     *
   1157     * When using a code unit sentinel (UnitIter≠LimitIter),
   1158     * then that sentinel also works as a sentinel for this code point iterator.
   1159     *
   1160     * @param p Range start or limit
   1161     * @draft ICU 78
   1162     */
   1163    U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {}
   1164    /**
   1165     * Default constructor. Makes a non-functional iterator.
   1166     *
   1167     * @draft ICU 78
   1168     */
   1169    U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
   1170 
   1171    /** Move constructor. @draft ICU 78 */
   1172    U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
   1173    /** Move assignment operator. @draft ICU 78 */
   1174    U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
   1175 
   1176    /** Copy constructor. @draft ICU 78 */
   1177    U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
   1178    /** Copy assignment operator. @draft ICU 78 */
   1179    U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
   1180 
   1181    /**
   1182     * @param other Another iterator
   1183     * @return true if this iterator is at the same position as the other one
   1184     * @draft ICU 78
   1185     */
   1186    U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
   1187        return getLogicalPosition() == other.getLogicalPosition();
   1188    }
   1189    /**
   1190     * @param other Another iterator
   1191     * @return true if this iterator is not at the same position as the other one
   1192     * @draft ICU 78
   1193     */
   1194    U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
   1195 
   1196    // Asymmetric equality & nonequality with a sentinel type.
   1197 
   1198    /**
   1199     * @param iter A UTFIterator
   1200     * @param s A unit iterator sentinel
   1201     * @return true if the iterator’s position is equal to the sentinel
   1202     * @draft ICU 78
   1203     */
   1204    template<typename Sentinel> U_FORCE_INLINE friend
   1205    std::enable_if_t<
   1206        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   1207        bool>
   1208    operator==(const UTFIterator &iter, const Sentinel &s) {
   1209        return iter.getLogicalPosition() == s;
   1210    }
   1211 
   1212 #if U_CPLUSPLUS_VERSION < 20
   1213    // C++17: Need to define all four combinations of == / != vs. parameter order.
   1214    // Once we require C++20, we could remove all but the first == because
   1215    // the compiler would generate the rest.
   1216 
   1217    /**
   1218     * @param s A unit iterator sentinel
   1219     * @param iter A UTFIterator
   1220     * @return true if the iterator’s position is equal to the sentinel
   1221     * @internal
   1222     */
   1223    template<typename Sentinel> U_FORCE_INLINE friend
   1224    std::enable_if_t<
   1225        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   1226        bool>
   1227    operator==(const Sentinel &s, const UTFIterator &iter) {
   1228        return iter.getLogicalPosition() == s;
   1229    }
   1230    /**
   1231     * @param iter A UTFIterator
   1232     * @param s A unit iterator sentinel
   1233     * @return true if the iterator’s position is not equal to the sentinel
   1234     * @internal
   1235     */
   1236    template<typename Sentinel> U_FORCE_INLINE friend
   1237    std::enable_if_t<
   1238        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   1239        bool>
   1240    operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
   1241    /**
   1242     * @param s A unit iterator sentinel
   1243     * @param iter A UTFIterator
   1244     * @return true if the iterator’s position is not equal to the sentinel
   1245     * @internal
   1246     */
   1247    template<typename Sentinel> U_FORCE_INLINE friend
   1248    std::enable_if_t<
   1249        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   1250        bool>
   1251    operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
   1252 #endif  // C++17
   1253 
   1254    /**
   1255     * Decodes the code unit sequence at the current position.
   1256     *
   1257     * @return CodeUnits with the decoded code point etc.
   1258     * @draft ICU 78
   1259     */
   1260    U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
   1261        if (state_ == 0) {
   1262            UnitIter p0 = p_;
   1263            units_ = Impl::readAndInc(p0, p_, limit_);
   1264            state_ = 1;
   1265        }
   1266        return units_;
   1267    }
   1268 
   1269    /**
   1270     * Decodes the code unit sequence at the current position.
   1271     * Used like <code>iter->codePoint()</code> or <code>iter->stringView()</code> etc.
   1272     *
   1273     * @return CodeUnits with the decoded code point etc., wrapped into
   1274     *     an opaque proxy object so that <code>iter->codePoint()</code> etc. works.
   1275     * @draft ICU 78
   1276     */
   1277    U_FORCE_INLINE Proxy operator->() const {
   1278        if (state_ == 0) {
   1279            UnitIter p0 = p_;
   1280            units_ = Impl::readAndInc(p0, p_, limit_);
   1281            state_ = 1;
   1282        }
   1283        return Proxy(units_);
   1284    }
   1285 
   1286    /**
   1287     * Pre-increment operator.
   1288     *
   1289     * @return this iterator
   1290     * @draft ICU 78
   1291     */
   1292    U_FORCE_INLINE UTFIterator &operator++() {  // pre-increment
   1293        if (state_ > 0) {
   1294            // operator*() called readAndInc() so p_ is already ahead.
   1295            state_ = 0;
   1296        } else if (state_ == 0) {
   1297            Impl::inc(p_, limit_);
   1298        } else /* state_ < 0 */ {
   1299            // operator--() called decAndRead() so we know how far to skip.
   1300            p_ = units_.end();
   1301            state_ = 0;
   1302        }
   1303        return *this;
   1304    }
   1305 
   1306    /**
   1307     * Post-increment operator.
   1308     *
   1309     * @return a copy of this iterator from before the increment.
   1310     *     If UnitIter is a single-pass input_iterator, then this function
   1311     *     returns an opaque proxy object so that <code>*iter++</code> still works.
   1312     * @draft ICU 78
   1313     */
   1314    U_FORCE_INLINE UTFIterator operator++(int) {  // post-increment
   1315        if (state_ > 0) {
   1316            // operator*() called readAndInc() so p_ is already ahead.
   1317            UTFIterator result(*this);
   1318            state_ = 0;
   1319            return result;
   1320        } else if (state_ == 0) {
   1321            UnitIter p0 = p_;
   1322            units_ = Impl::readAndInc(p0, p_, limit_);
   1323            UTFIterator result(*this);
   1324            result.state_ = 1;
   1325            // keep this->state_ == 0
   1326            return result;
   1327        } else /* state_ < 0 */ {
   1328            UTFIterator result(*this);
   1329            // operator--() called decAndRead() so we know how far to skip.
   1330            p_ = units_.end();
   1331            state_ = 0;
   1332            return result;
   1333        }
   1334    }
   1335 
   1336    /**
   1337     * Pre-decrement operator.
   1338     * Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
   1339     *
   1340     * @return this iterator
   1341     * @draft ICU 78
   1342     */
   1343    template<typename Iter = UnitIter>
   1344    U_FORCE_INLINE
   1345    std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &>
   1346    operator--() {  // pre-decrement
   1347        if (state_ > 0) {
   1348            // operator*() called readAndInc() so p_ is ahead of the logical position.
   1349            p_ = units_.begin();
   1350        }
   1351        units_ = Impl::decAndRead(start_, p_);
   1352        state_ = -1;
   1353        return *this;
   1354    }
   1355 
   1356    /**
   1357     * Post-decrement operator.
   1358     * Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
   1359     *
   1360     * @return a copy of this iterator from before the decrement.
   1361     * @draft ICU 78
   1362     */
   1363    template<typename Iter = UnitIter>
   1364    U_FORCE_INLINE
   1365    std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator>
   1366    operator--(int) {  // post-decrement
   1367        UTFIterator result(*this);
   1368        operator--();
   1369        return result;
   1370    }
   1371 
   1372 private:
   1373    friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>;
   1374 
   1375    U_FORCE_INLINE UnitIter getLogicalPosition() const {
   1376        return state_ <= 0 ? p_ : units_.begin();
   1377    }
   1378 
   1379    // operator*() etc. are logically const.
   1380    mutable UnitIter p_;
   1381    // In a validating iterator, we need start_ & limit_ so that when we read a code point
   1382    // (forward or backward) we can test if there are enough code units.
   1383    UnitIter start_;
   1384    LimitIter limit_;
   1385    // Keep state so that we call readAndInc() only once for both operator*() and ++
   1386    // to make it easy for the compiler to optimize.
   1387    mutable CodeUnits<CP32, UnitIter> units_;
   1388    // >0: units_ = readAndInc(), p_ = units limit
   1389    //     which means that p_ is ahead of its logical position
   1390    //  0: initial state
   1391    // <0: units_ = decAndRead(), p_ = units start
   1392    mutable int8_t state_ = 0;
   1393 };
   1394 
   1395 #ifndef U_IN_DOXYGEN
   1396 // Partial template specialization for single-pass input iterator.
   1397 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
   1398 class UTFIterator<
   1399        CP32, behavior,
   1400        UnitIter, LimitIter,
   1401        std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
   1402    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
   1403    using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
   1404 
   1405    // Proxy type for post-increment return value, to make *iter++ work.
   1406    // Also for operator->() (required by LegacyInputIterator)
   1407    // so that we don't promise always returning CodeUnits.
   1408    class Proxy {
   1409    public:
   1410        explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
   1411        CodeUnits<CP32, UnitIter> &operator*() { return units_; }
   1412        CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
   1413    private:
   1414        CodeUnits<CP32, UnitIter> units_;
   1415    };
   1416 
   1417 public:
   1418    using value_type = CodeUnits<CP32, UnitIter>;
   1419    using reference = value_type;
   1420    using pointer = Proxy;
   1421    using difference_type = prv::iter_difference_t<UnitIter>;
   1422    using iterator_category = std::input_iterator_tag;
   1423 
   1424    U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {}
   1425 
   1426    // Constructs an iterator start or limit sentinel.
   1427    // Requires p to be copyable.
   1428    U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {}
   1429 
   1430    U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
   1431    U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
   1432 
   1433    U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
   1434    U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
   1435 
   1436    U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
   1437        return p_ == other.p_ && ahead_ == other.ahead_;
   1438        // Strictly speaking, we should check if the logical position is the same.
   1439        // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
   1440    }
   1441    U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
   1442 
   1443    template<typename Sentinel> U_FORCE_INLINE friend
   1444    std::enable_if_t<
   1445        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   1446        bool>
   1447    operator==(const UTFIterator &iter, const Sentinel &s) {
   1448        return !iter.ahead_ && iter.p_ == s;
   1449    }
   1450 
   1451 #if U_CPLUSPLUS_VERSION < 20
   1452    template<typename Sentinel> U_FORCE_INLINE friend
   1453    std::enable_if_t<
   1454        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   1455        bool>
   1456    operator==(const Sentinel &s, const UTFIterator &iter) {
   1457        return !iter.ahead_ && iter.p_ == s;
   1458    }
   1459 
   1460    template<typename Sentinel> U_FORCE_INLINE friend
   1461    std::enable_if_t<
   1462        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   1463        bool>
   1464    operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
   1465 
   1466    template<typename Sentinel> U_FORCE_INLINE friend
   1467    std::enable_if_t<
   1468        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   1469        bool>
   1470    operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
   1471 #endif  // C++17
   1472 
   1473    U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
   1474        if (!ahead_) {
   1475            units_ = Impl::readAndInc(p_, p_, limit_);
   1476            ahead_ = true;
   1477        }
   1478        return units_;
   1479    }
   1480 
   1481    U_FORCE_INLINE Proxy operator->() const {
   1482        if (!ahead_) {
   1483            units_ = Impl::readAndInc(p_, p_, limit_);
   1484            ahead_ = true;
   1485        }
   1486        return Proxy(units_);
   1487    }
   1488 
   1489    U_FORCE_INLINE UTFIterator &operator++() {  // pre-increment
   1490        if (ahead_) {
   1491            // operator*() called readAndInc() so p_ is already ahead.
   1492            ahead_ = false;
   1493        } else {
   1494            Impl::inc(p_, limit_);
   1495        }
   1496        return *this;
   1497    }
   1498 
   1499    U_FORCE_INLINE Proxy operator++(int) {  // post-increment
   1500        if (ahead_) {
   1501            // operator*() called readAndInc() so p_ is already ahead.
   1502            ahead_ = false;
   1503        } else {
   1504            units_ = Impl::readAndInc(p_, p_, limit_);
   1505            // keep this->ahead_ == false
   1506        }
   1507        return Proxy(units_);
   1508    }
   1509 
   1510 private:
   1511    // operator*() etc. are logically const.
   1512    mutable UnitIter p_;
   1513    // In a validating iterator, we need limit_ so that when we read a code point
   1514    // we can test if there are enough code units.
   1515    LimitIter limit_;
   1516    // Keep state so that we call readAndInc() only once for both operator*() and ++
   1517    // so that we can use a single-pass input iterator for UnitIter.
   1518    mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false};
   1519    // true: units_ = readAndInc(), p_ = units limit
   1520    //     which means that p_ is ahead of its logical position
   1521    // false: initial state
   1522    mutable bool ahead_ = false;
   1523 };
   1524 #endif  // U_IN_DOXYGEN
   1525 
   1526 }  // namespace U_HEADER_ONLY_NAMESPACE
   1527 
   1528 #ifndef U_IN_DOXYGEN
   1529 // Bespoke specialization of reverse_iterator.
   1530 // The default implementation implements reverse operator*() and ++ in a way
   1531 // that does most of the same work twice for reading variable-length sequences.
   1532 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
   1533 class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
   1534    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
   1535    using Forward_ = U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>;
   1536    using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
   1537    using CodeUnits_ = U_HEADER_ONLY_NAMESPACE::CodeUnits<CP32, UnitIter>;
   1538
   1539    // Return type of operator->() (required by LegacyInputIterator).
   1540    // It holds its own copy of the code units so that we don't promise always returning CodeUnits.
   1541    class Proxy {
   1542        CodeUnits_ units_;
   1543    public:
   1544        explicit Proxy(CodeUnits_ units) : units_(units) {}
   1545        CodeUnits_ *operator->() { return &units_; }
   1546        CodeUnits_ &operator*() { return units_; }
   1547    };
   1548
   1549 public:
   1550    using iterator_category = std::bidirectional_iterator_tag;
   1551    using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
   1552    using value_type = CodeUnits_;
   1553    using pointer = Proxy;
   1554    using reference = value_type;
   1555
           // Starts out at the logical position of iter and copies its start and limit.
   1556    U_FORCE_INLINE explicit reverse_iterator(Forward_ iter) :
   1557            p_(iter.getLogicalPosition()),
   1558            start_(iter.start_),
   1559            limit_(iter.limit_),
   1560            units_(0, 0, false, p_, p_) {}
           // Default constructor: all three code unit iterators are value-initialized.
   1561    U_FORCE_INLINE reverse_iterator() :
   1562            p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
   1563
   1564    U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
   1565    U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
   1566
   1567    U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
   1568    U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
   1569
           // Equality compares logical positions, not the raw p_ which may be behind.
   1570    U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
   1571        return getLogicalPosition() == other.getLogicalPosition();
   1572    }
   1573    U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
   1574
   1575    U_FORCE_INLINE CodeUnits_ operator*() const {
   1576        decodeBehindIfNeeded();
   1577        return units_;
   1578    }
   1579
   1580    U_FORCE_INLINE Proxy operator->() const {
   1581        decodeBehindIfNeeded();
   1582        return Proxy(units_);
   1583    }
   1584
   1585    U_FORCE_INLINE reverse_iterator &operator++() {  // pre-increment
   1586        if (state_ == 0) {
   1587            Impl::dec(start_, p_);
   1588        } else {
   1589            if (state_ > 0) {
   1590                // operator--() called readAndInc() so we know how far to skip.
   1591                p_ = units_.begin();
   1592            }
                   // else: operator*() called decAndRead() so p_ is already behind.
   1593            state_ = 0;
   1594        }
   1595        return *this;
   1596    }
   1597
   1598    U_FORCE_INLINE reverse_iterator operator++(int) {  // post-increment
   1599        if (state_ == 0) {
                   // Decode now so that the returned copy already holds its code units.
   1600            units_ = Impl::decAndRead(start_, p_);
   1601            reverse_iterator before(*this);
   1602            before.state_ = -1;
   1603            // this->state_ stays 0
   1604            return before;
   1605        }
   1606        reverse_iterator before(*this);
   1607        if (state_ > 0) {
   1608            // operator--() called readAndInc() so we know how far to skip.
   1609            p_ = units_.begin();
   1610        }
               // else: operator*() called decAndRead() so p_ is already behind.
   1611        state_ = 0;
   1612        return before;
   1613    }
   1614
   1615    U_FORCE_INLINE reverse_iterator &operator--() {  // pre-decrement
   1616        if (state_ < 0) {
   1617            // operator*() called decAndRead() so p_ is behind the logical position.
   1618            p_ = units_.end();
   1619        }
   1620        UnitIter unitsStart = p_;
   1621        units_ = Impl::readAndInc(unitsStart, p_, limit_);
   1622        state_ = 1;
   1623        return *this;
   1624    }
   1625
   1626    U_FORCE_INLINE reverse_iterator operator--(int) {  // post-decrement
   1627        reverse_iterator before(*this);
   1628        operator--();
   1629        return before;
   1630    }
   1631
   1632 private:
           // Shared by operator*() and operator->(): calls decAndRead() only in the initial state.
   1633    U_FORCE_INLINE void decodeBehindIfNeeded() const {
   1634        if (state_ == 0) {
   1635            units_ = Impl::decAndRead(start_, p_);
   1636            state_ = -1;
   1637        }
   1638    }
   1639
   1640    U_FORCE_INLINE UnitIter getLogicalPosition() const {
   1641        if (state_ < 0) { return units_.end(); }
   1642        return p_;
   1643    }
   1644
   1645    // operator*() etc. are logically const.
   1646    mutable UnitIter p_;
   1647    // A validating iterator needs start_ & limit_: when reading a code point
   1648    // (forward or backward) it must test whether there are enough code units.
   1649    UnitIter start_;
   1650    UnitIter limit_;
   1651    // Cached so that decAndRead() is called only once for both operator*() and ++,
   1652    // to make it easy for the compiler to optimize.
   1653    mutable CodeUnits_ units_;
   1654    // state_ > 0: units_ = readAndInc(), p_ = units limit
   1655    // state_ == 0: initial state
   1656    // state_ < 0: units_ = decAndRead(), p_ = units start,
   1657    //     which means that p_ is behind its logical position
   1658    mutable int8_t state_ = 0;
   1659 };
   1660 #endif  // U_IN_DOXYGEN
   1661 
   1662 namespace U_HEADER_ONLY_NAMESPACE {
   1663 
   1664 /**
   1665 * UTFIterator factory function for start <= p < limit.
   1666 * Deduces the UnitIter and LimitIter template parameters from the inputs.
   1667 * Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
   1668 *
   1669 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
   1670 * @tparam behavior How to handle ill-formed Unicode strings
   1671 * @tparam UnitIter Can usually be omitted/deduced:
   1672 *     An iterator (often a pointer) that returns a code unit type:
   1673 *     UTF-8: char or char8_t or uint8_t;
   1674 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
   1675 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
   1676 * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
   1677 * @param start start code unit iterator
   1678 * @param p current-position code unit iterator
   1679 * @param limit limit (exclusive-end) code unit iterator.
   1680 *     When using a code unit sentinel (UnitIter≠LimitIter),
   1681 *     then that sentinel also works as a sentinel for the code point iterator.
   1682 * @return a UTFIterator&lt;CP32, behavior, UnitIter&gt;
   1683 *     for the given code unit iterators or character pointers
   1684 * @draft ICU 78
   1685 */
   1686 template<typename CP32, UTFIllFormedBehavior behavior,
   1687         typename UnitIter, typename LimitIter = UnitIter>
   1688 auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
           // Name the resulting iterator type once, then pass on start, current position and limit.
   1689    using Iter = UTFIterator<CP32, behavior, UnitIter, LimitIter>;
   1690    return Iter(std::move(start), std::move(p), std::move(limit));
   1691 }
   1692 
   1693 /**
   1694 * UTFIterator factory function for start = p < limit.
   1695 * Deduces the UnitIter and LimitIter template parameters from the inputs.
   1696 *
   1697 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
   1698 * @tparam behavior How to handle ill-formed Unicode strings
   1699 * @tparam UnitIter Can usually be omitted/deduced:
   1700 *     An iterator (often a pointer) that returns a code unit type:
   1701 *     UTF-8: char or char8_t or uint8_t;
   1702 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
   1703 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
   1704 * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
   1705 * @param p start and current-position code unit iterator
   1706 * @param limit limit (exclusive-end) code unit iterator.
   1707 *     When using a code unit sentinel (UnitIter≠LimitIter),
   1708 *     then that sentinel also works as a sentinel for the code point iterator.
   1709 * @return a UTFIterator&lt;CP32, behavior, UnitIter&gt;
   1710 *     for the given code unit iterators or character pointers
   1711 * @draft ICU 78
   1712 */
   1713 template<typename CP32, UTFIllFormedBehavior behavior,
   1714         typename UnitIter, typename LimitIter = UnitIter>
   1715 auto utfIterator(UnitIter p, LimitIter limit) {
           // Name the resulting iterator type once, then pass on the position and the limit.
   1716    using Iter = UTFIterator<CP32, behavior, UnitIter, LimitIter>;
   1717    return Iter(std::move(p), std::move(limit));
   1718 }
   1719 
   1720 // Note: We should only enable the following factory function for a copyable UnitIter.
   1721 // In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator,
   1722 // but a function template partial specialization is not allowed.
   1723 // In C++20, we might be able to require the std::copyable concept.
   1724 
   1725 /**
   1726 * UTFIterator factory function for a start or limit sentinel.
   1727 * Deduces the UnitIter template parameter from the input.
   1728 * Requires UnitIter to be copyable.
   1729 *
   1730 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
   1731 * @tparam behavior How to handle ill-formed Unicode strings
   1732 * @tparam UnitIter Can usually be omitted/deduced:
   1733 *     An iterator (often a pointer) that returns a code unit type:
   1734 *     UTF-8: char or char8_t or uint8_t;
   1735 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
   1736 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
   1737 * @param p code unit iterator.
   1738 *     When using a code unit sentinel,
   1739 *     then that sentinel also works as a sentinel for the code point iterator.
   1740 * @return a UTFIterator&lt;CP32, behavior, UnitIter&gt;
   1741 *     for the given code unit iterator or character pointer
   1742 * @draft ICU 78
   1743 */
   1744 template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
   1745 auto utfIterator(UnitIter p) {
           // Only the UnitIter type is spelled out here; p is moved into the new iterator.
           using Iter = UTFIterator<CP32, behavior, UnitIter>;
   1746    return Iter(std::move(p));
   1747 }
   1748 
   1749 /**
   1750 * A C++ "range" for validating iteration over all of the code points of a code unit range.
   1751 *
   1752 * Call utfStringCodePoints() to have the compiler deduce the Range type.
   1753 *
   1754 * UTFStringCodePoints is conditionally borrowed; that is, if Range is a borrowed range
   1755 * so is UTFStringCodePoints<CP32, behavior, Range>.
   1756 * Note that when given a range r that is an lvalue and is not a view, utfStringCodePoints(r) uses a
   1757 * ref_view of r as the Range type, which is a borrowed range.
   1758 * In practice, this means that given a container variable r, the iterators of utfStringCodePoints(r) can
   1759 * be used as long as iterators on r are valid, without having to keep utfStringCodePoints(r) around.
   1760 * For instance:
   1761 * \code
   1762 *     std::u8string s = u8"𒇧𒇧";
   1763 *     // it outlives utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s).
   1764 *     auto it = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s).begin();
   1765 *     ++it;
   1766 *     char32_t second_code_point = it->codePoint();  // OK.
   1767 * \endcode
   1768 * 
   1769 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
   1770 *              should be signed if UTF_BEHAVIOR_NEGATIVE
   1771 * @tparam behavior How to handle ill-formed Unicode strings
   1772 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
   1773 * @draft ICU 78
   1774 * @see utfStringCodePoints
   1775 */
   1776 template<typename CP32, UTFIllFormedBehavior behavior, typename Range>
   1777 class UTFStringCodePoints {
   1778    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
   1779 public:
   1780    /**
   1781     * Default constructor; makes an empty C++ "range" object.
   1782     * @draft ICU 78
   1783     */
   1784    UTFStringCodePoints() = default;
   1785
   1786    /**
   1787     * Constructs a C++ "range" object over the code points in the string;
   1788     * the code unit range is moved into this object. Enabled when Range is not a reference type.
   1789     * @param unitRange input range
   1790     * @draft ICU 78
   1791     */
   1792    template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
   1793    explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
   1794    /**
   1795     * Constructs a C++ "range" object over the code points in the string
   1796     * which only keeps a reference to the code unit range.
   1797     * Enabled when Range is a reference type; utfStringCodePoints uses this overload
   1798     * in C++17, whereas in C++20 a ref_view is used instead (via views::all).
   1799     * @param unitRange input range
   1800     * @draft ICU 78
   1801     */
   1802    template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
   1803    explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
   1804
   1805    /** Copy constructor. @draft ICU 78 */
   1806    UTFStringCodePoints(const UTFStringCodePoints &other) = default;
   1807
   1808    /** Copy assignment operator. @draft ICU 78 */
   1809    UTFStringCodePoints &operator=(const UTFStringCodePoints &other) = default;
   1810
   1811    /**
   1812     * @return a code point iterator at the start of the code unit range
   1813     * @draft ICU 78
   1814     */
   1815    auto begin() {
   1816        return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
   1817    }
   1818
   1819    /**
   1820     * @return a code point iterator at the start of the code unit range (const overload)
   1821     * @draft ICU 78
   1822     */
   1823    template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
   1824    auto begin() const {
   1825        return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
   1826    }
   1827
   1828    /**
   1829     * @return the range limit (exclusive end): an iterator, or the code unit sentinel if there is one
   1830     * @draft ICU 78
   1831     */
   1832    auto end() {
   1833        using First = decltype(unitRange.begin());
   1834        using Last = decltype(unitRange.end());
   1835        if constexpr (std::is_same_v<First, Last>) {
   1836            if constexpr (prv::bidirectional_iterator<First>) {
   1837                return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
   1838            } else {
                       // The input iterator specialization has no three-argument constructor.
   1839                return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
   1840            }
   1841        } else {
                   // begin() and end() differ in type: return the code unit sentinel.
   1842            return unitRange.end();
   1843        }
   1844    }
   1845
   1846    /**
   1847     * @return the range limit (exclusive end), as above (const overload)
   1848     * @draft ICU 78
   1849     */
   1850    template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
   1851    auto end() const {
   1852        using First = decltype(unitRange.begin());
   1853        using Last = decltype(unitRange.end());
   1854        if constexpr (std::is_same_v<First, Last>) {
   1855            if constexpr (prv::bidirectional_iterator<First>) {
   1856                return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
   1857            } else {
                       // The input iterator specialization has no three-argument constructor.
   1858                return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
   1859            }
   1860        } else {
                   // begin() and end() differ in type: return the code unit sentinel.
   1861            return unitRange.end();
   1862        }
   1863    }
   1864
   1865    /**
   1866     * @return std::reverse_iterator(end())
   1867     * @draft ICU 78
   1868     */
   1869    auto rbegin() const {
   1870        return std::make_reverse_iterator(end());
   1871    }
   1872
   1873    /**
   1874     * @return std::reverse_iterator(begin())
   1875     * @draft ICU 78
   1876     */
   1877    auto rend() const {
   1878        return std::make_reverse_iterator(begin());
   1879    }
   1880
   1881 private:
   1882    Range unitRange;
   1883 };
   1883 
   1884 /** @internal */
   1885 template<typename CP32, UTFIllFormedBehavior behavior>
   1886 struct UTFStringCodePointsAdaptor
   1887 #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 &&                                         \
   1888    __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
   1889    : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
   1890 #endif
   1891 {
   1892    /** @internal */
   1893    template<typename Range>
   1894    auto operator()(Range &&unitRange) const {
   1895 #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10  // We need https://wg21.link/P2415R2.
   1896        // all_t<Range> is Range itself if Range is a view.
   1897        using Units = std::ranges::views::all_t<Range>;
   1898 #else
   1899        // Without all_t: keep Range exactly as deduced, except that a basic_string_view
   1900        // is taken by copy, not by reference.
   1901        using Bare = std::decay_t<Range>;
   1902        using Units = std::conditional_t<prv::is_basic_string_view_v<Bare>, Bare, Range>;
   1903 #endif
   1904        return UTFStringCodePoints<CP32, behavior, Units>(std::forward<Range>(unitRange));
   1905    }
   1906 };
   1910 
   1911 /**
   1912 * Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code
   1913 * points in a code unit range, which validates while decoding.
   1914 * Deduces the Range template parameter from the input, taking into account the value category: the
   1915 * code units will be referenced if possible, and moved if necessary.
   1916 *
   1917 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
   1918 *              should be signed if UTF_BEHAVIOR_NEGATIVE
   1919 * @tparam behavior How to handle ill-formed Unicode strings
   1920 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
   1921 * @param unitRange input range
   1922 * @return a UTFStringCodePoints&lt;CP32, behavior, Range&gt; for the given unitRange
   1923 * @draft ICU 78
   1924 */
   1925 template<typename CP32, UTFIllFormedBehavior behavior>
        // One constexpr adaptor object per <CP32, behavior> combination; calling it invokes
        // UTFStringCodePointsAdaptor::operator()(Range &&) on the given code unit range.
   1926 constexpr UTFStringCodePointsAdaptor<CP32, behavior> utfStringCodePoints;
   1927 
   1928 // Non-validating iterators ------------------------------------------------ ***
   1929 
   1930 /**
   1931 * Non-validating iterator over the code points in a Unicode string.
   1932 * The string must be well-formed.
   1933 *
   1934 * The UnitIter can be
   1935 * an input_iterator, a forward_iterator, or a bidirectional_iterator (including a pointer).
   1936 * The UnsafeUTFIterator will have the corresponding iterator_category.
   1937 *
   1938 * Call unsafeUTFIterator() to have the compiler deduce the UnitIter type.
   1939 *
   1940 * For reverse iteration, either use this iterator directly as in <code>*--iter</code>
   1941 * or wrap it using std::make_reverse_iterator(iter).
   1942 *
   1943 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
   1944 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
   1945 *     UTF-8: char or char8_t or uint8_t;
   1946 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
   1947 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
   1948 * @draft ICU 78
   1949 * @see unsafeUTFIterator
   1950 */
   1951 template<typename CP32, typename UnitIter, typename = void>
   1952 class UnsafeUTFIterator {
   1953    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
   1954    using Impl = UnsafeUTFImpl<CP32, UnitIter>;
   1955    using Units_ = UnsafeCodeUnits<CP32, UnitIter>;
   1956
   1957    // Return type of operator->() (required by LegacyInputIterator).
   1958    // It holds its own copy of the code units
   1959    // so that we don't promise always returning UnsafeCodeUnits.
   1960    class Proxy {
   1961        Units_ units_;
   1962    public:
   1963        explicit Proxy(Units_ &units) : units_(units) {}
   1964        Units_ *operator->() { return &units_; }
   1965        Units_ &operator*() { return units_; }
   1966    };
   1967
   1968 public:
   1969    /** C++ iterator boilerplate @internal */
   1970    using iterator_category = std::conditional_t<
   1971        prv::bidirectional_iterator<UnitIter>,
   1972        std::bidirectional_iterator_tag,
   1973        std::forward_iterator_tag>;
   1974    /** C++ iterator boilerplate @internal */
   1975    using difference_type = prv::iter_difference_t<UnitIter>;
   1976    /** C++ iterator boilerplate @internal */
   1977    using value_type = UnsafeCodeUnits<CP32, UnitIter>;
   1978    /** C++ iterator boilerplate @internal */
   1979    using pointer = Proxy;
   1980    /** C++ iterator boilerplate @internal */
   1981    using reference = value_type;
   1982
   1983    /**
   1984     * Constructor; p should be at a code point boundary.
   1985     *
   1986     * When a code unit sentinel is used,
   1987     * it also works as a sentinel for this code point iterator.
   1988     *
   1989     * @param p Initial position inside the range, or a range sentinel
   1990     * @draft ICU 78
   1991     */
   1992    U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
   1993    /**
   1994     * Default constructor. The resulting iterator is non-functional.
   1995     *
   1996     * @draft ICU 78
   1997     */
   1998    U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
   1999
   2000    /** Copy constructor. @draft ICU 78 */
   2001    U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
   2002    /** Copy assignment operator. @draft ICU 78 */
   2003    U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default;
   2004    /** Move constructor. @draft ICU 78 */
   2005    U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
   2006    /** Move assignment operator. @draft ICU 78 */
   2007    U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
   2008
   2009    /**
   2010     * @param other Another iterator
   2011     * @return true if both iterators are at the same logical position
   2012     * @draft ICU 78
   2013     */
   2014    U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
   2015        return getLogicalPosition() == other.getLogicalPosition();
   2016    }
   2017    /**
   2018     * @param other Another iterator
   2019     * @return true if the two iterators are not at the same logical position
   2020     * @draft ICU 78
   2021     */
   2022    U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
   2023
   2024    /**
   2025     * @param iter An UnsafeUTFIterator
   2026     * @param s A unit iterator sentinel
   2027     * @return true if the logical position of the iterator is equal to the sentinel
   2028     * @draft ICU 78
   2029     */
   2030    template<typename Sentinel> U_FORCE_INLINE friend
   2031    std::enable_if_t<
   2032        !(std::is_same_v<Sentinel, UnsafeUTFIterator> || std::is_same_v<Sentinel, UnitIter>),
   2033        bool>
   2034    operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
   2035        return iter.getLogicalPosition() == s;
   2036    }
   2037
   2038 #if U_CPLUSPLUS_VERSION < 20
   2039    /**
   2040     * @param s A unit iterator sentinel
   2041     * @param iter An UnsafeUTFIterator
   2042     * @return true if the logical position of the iterator is equal to the sentinel
   2043     * @internal
   2044     */
   2045    template<typename Sentinel> U_FORCE_INLINE friend
   2046    std::enable_if_t<
   2047        !(std::is_same_v<Sentinel, UnsafeUTFIterator> || std::is_same_v<Sentinel, UnitIter>),
   2048        bool>
   2049    operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
   2050        return iter.getLogicalPosition() == s;
   2051    }
   2052    /**
   2053     * @param iter An UnsafeUTFIterator
   2054     * @param s A unit iterator sentinel
   2055     * @return true if the logical position of the iterator is not equal to the sentinel
   2056     * @internal
   2057     */
   2058    template<typename Sentinel> U_FORCE_INLINE friend
   2059    std::enable_if_t<
   2060        !(std::is_same_v<Sentinel, UnsafeUTFIterator> || std::is_same_v<Sentinel, UnitIter>),
   2061        bool>
   2062    operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
   2063    /**
   2064     * @param s A unit iterator sentinel
   2065     * @param iter An UnsafeUTFIterator
   2066     * @return true if the logical position of the iterator is not equal to the sentinel
   2067     * @internal
   2068     */
   2069    template<typename Sentinel> U_FORCE_INLINE friend
   2070    std::enable_if_t<
   2071        !(std::is_same_v<Sentinel, UnsafeUTFIterator> || std::is_same_v<Sentinel, UnitIter>),
   2072        bool>
   2073    operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
   2074 #endif  // C++17
   2075
   2076    /**
   2077     * Decodes the code unit sequence at the current position.
   2078     *
   2079     * @return UnsafeCodeUnits with the decoded code point etc.
   2080     * @draft ICU 78
   2081     */
   2082    U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
   2083        readAheadIfNeeded();
   2084        return units_;
   2085    }
   2086
   2087    /**
   2088     * Decodes the code unit sequence at the current position.
   2089     * Used like <code>iter->codePoint()</code> or <code>iter->stringView()</code> etc.
   2090     *
   2091     * @return UnsafeCodeUnits with the decoded code point etc., wrapped into
   2092     *     an opaque proxy object so that <code>iter->codePoint()</code> etc. works.
   2093     * @draft ICU 78
   2094     */
   2095    U_FORCE_INLINE Proxy operator->() const {
   2096        readAheadIfNeeded();
   2097        return Proxy(units_);
   2098    }
   2099
   2100    /**
   2101     * Pre-increment operator.
   2102     *
   2103     * @return this iterator
   2104     * @draft ICU 78
   2105     */
   2106    U_FORCE_INLINE UnsafeUTFIterator &operator++() {  // pre-increment
   2107        if (state_ == 0) {
   2108            Impl::inc(p_);
   2109        } else {
   2110            if (state_ < 0) {
   2111                // operator--() called decAndRead() so we know how far to skip.
   2112                p_ = units_.end();
   2113            }
   2114            // else: operator*() called readAndInc() so p_ is already ahead.
   2115            state_ = 0;
   2116        }
   2117        return *this;
   2118    }
   2119
   2120    /**
   2121     * Post-increment operator.
   2122     *
   2123     * @return a copy of this iterator from before the increment.
   2124     *     If UnitIter is a single-pass input_iterator, then this function
   2125     *     returns an opaque proxy object so that <code>*iter++</code> still works.
   2126     * @draft ICU 78
   2127     */
   2128    U_FORCE_INLINE UnsafeUTFIterator operator++(int) {  // post-increment
   2129        if (state_ == 0) {
   2130            // Decode first so that the returned copy already holds its code units.
   2131            UnitIter unitsStart = p_;
   2132            units_ = Impl::readAndInc(unitsStart, p_);
   2133            UnsafeUTFIterator before(*this);
   2134            before.state_ = 1;
   2135            // this->state_ stays 0
   2136            return before;
   2137        }
   2138        UnsafeUTFIterator before(*this);
   2139        if (state_ < 0) {
   2140            // operator--() called decAndRead() so we know how far to skip.
   2141            p_ = units_.end();
   2142        }
   2143        // else: operator*() called readAndInc() so p_ is already ahead.
   2144        state_ = 0;
   2145        return before;
   2146    }
   2147
   2148    /**
   2149     * Pre-decrement operator.
   2150     * Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
   2151     *
   2152     * @return this iterator
   2153     * @draft ICU 78
   2154     */
   2155    template<typename Iter = UnitIter>
   2156    U_FORCE_INLINE
   2157    std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
   2158    operator--() {  // pre-decrement
   2159        if (state_ > 0) {
   2160            // operator*() called readAndInc() so p_ is ahead of the logical position.
   2161            p_ = units_.begin();
   2162        }
   2163        units_ = Impl::decAndRead(p_);
   2164        state_ = -1;
   2165        return *this;
   2166    }
   2167
   2168    /**
   2169     * Post-decrement operator.
   2170     * Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
   2171     *
   2172     * @return a copy of this iterator from before the decrement.
   2173     * @draft ICU 78
   2174     */
   2175    template<typename Iter = UnitIter>
   2176    U_FORCE_INLINE
   2177    std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
   2178    operator--(int) {  // post-decrement
   2179        UnsafeUTFIterator before(*this);
   2180        operator--();
   2181        return before;
   2182    }
   2183
   2184 private:
   2185    friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
   2186
   2187    // Shared by operator*() and operator->(): calls readAndInc() only in the initial state.
   2188    U_FORCE_INLINE void readAheadIfNeeded() const {
   2189        if (state_ == 0) {
   2190            UnitIter unitsStart = p_;
   2191            units_ = Impl::readAndInc(unitsStart, p_);
   2192            state_ = 1;
   2193        }
   2194    }
   2195
   2196    U_FORCE_INLINE UnitIter getLogicalPosition() const {
   2197        if (state_ > 0) { return units_.begin(); }
   2198        return p_;
   2199    }
   2200
   2201    // operator*() etc. are logically const.
   2202    mutable UnitIter p_;
   2203    // Cached so that readAndInc() is called only once for both operator*() and ++,
   2204    // to make it easy for the compiler to optimize.
   2205    mutable UnsafeCodeUnits<CP32, UnitIter> units_;
   2206    // state_ > 0: units_ = readAndInc(), p_ = units limit,
   2207    //     which means that p_ is ahead of its logical position
   2208    // state_ == 0: initial state
   2209    // state_ < 0: units_ = decAndRead(), p_ = units start
   2210    mutable int8_t state_ = 0;
   2211 };
   2212 
   2213 #ifndef U_IN_DOXYGEN
   2214 // Partial template specialization for single-pass input iterator.
   2215 template<typename CP32, typename UnitIter>
   2216 class UnsafeUTFIterator<
   2217        CP32,
   2218        UnitIter,
   2219        std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
   2220    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
   2221    using Impl = UnsafeUTFImpl<CP32, UnitIter>;
   2222 
   2223    // Proxy type for post-increment return value, to make *iter++ work.
   2224    // Also for operator->() (required by LegacyInputIterator)
   2225    // so that we don't promise always returning UnsafeCodeUnits.
   2226    class Proxy {
   2227    public:
   2228        explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
   2229        UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
   2230        UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
   2231    private:
   2232        UnsafeCodeUnits<CP32, UnitIter> units_;
   2233    };
   2234 
   2235 public:
   2236    using value_type = UnsafeCodeUnits<CP32, UnitIter>;
   2237    using reference = value_type;
   2238    using pointer = Proxy;
   2239    using difference_type = prv::iter_difference_t<UnitIter>;
   2240    using iterator_category = std::input_iterator_tag;
   2241 
   2242    U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
   2243 
   2244    U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
   2245    U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
   2246 
   2247    U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
   2248    U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default;
   2249 
   2250    U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
   2251        return p_ == other.p_ && ahead_ == other.ahead_;
   2252        // Strictly speaking, we should check if the logical position is the same.
   2253        // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
   2254    }
   2255    U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
   2256 
   2257    template<typename Sentinel> U_FORCE_INLINE friend
   2258    std::enable_if_t<
   2259        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   2260        bool>
   2261    operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
   2262        return !iter.ahead_ && iter.p_ == s;
   2263    }
   2264 
   2265 #if U_CPLUSPLUS_VERSION < 20
   2266    template<typename Sentinel> U_FORCE_INLINE friend
   2267    std::enable_if_t<
   2268        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   2269        bool>
   2270    operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
   2271        return !iter.ahead_ && iter.p_ == s;
   2272    }
   2273 
   2274    template<typename Sentinel> U_FORCE_INLINE friend
   2275    std::enable_if_t<
   2276        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   2277        bool>
   2278    operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
   2279 
   2280    template<typename Sentinel> U_FORCE_INLINE friend
   2281    std::enable_if_t<
   2282        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
   2283        bool>
   2284    operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
   2285 #endif  // C++17
   2286 
   2287    U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
   2288        if (!ahead_) {
   2289            units_ = Impl::readAndInc(p_, p_);
   2290            ahead_ = true;
   2291        }
   2292        return units_;
   2293    }
   2294 
   2295    U_FORCE_INLINE Proxy operator->() const {
   2296        if (!ahead_) {
   2297            units_ = Impl::readAndInc(p_, p_);
   2298            ahead_ = true;
   2299        }
   2300        return Proxy(units_);
   2301    }
   2302 
   2303    U_FORCE_INLINE UnsafeUTFIterator &operator++() {  // pre-increment
   2304        if (ahead_) {
   2305            // operator*() called readAndInc() so p_ is already ahead.
   2306            ahead_ = false;
   2307        } else {
   2308            Impl::inc(p_);
   2309        }
   2310        return *this;
   2311    }
   2312 
   2313    U_FORCE_INLINE Proxy operator++(int) {  // post-increment
   2314        if (ahead_) {
   2315            // operator*() called readAndInc() so p_ is already ahead.
   2316            ahead_ = false;
   2317        } else {
   2318            units_ = Impl::readAndInc(p_, p_);
   2319            // keep this->ahead_ == false
   2320        }
   2321        return Proxy(units_);
   2322    }
   2323 
   2324 private:
   2325    // operator*() etc. are logically const.
   2326    mutable UnitIter p_;
   2327    // Keep state so that we call readAndInc() only once for both operator*() and ++
   2328    // so that we can use a single-pass input iterator for UnitIter.
   2329    mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
   2330    // true: units_ = readAndInc(), p_ = units limit
   2331    //     which means that p_ is ahead of its logical position
   2332    // false: initial state
   2333    mutable bool ahead_ = false;
   2334 };
   2335 #endif  // U_IN_DOXYGEN
   2336 
   2337 }  // namespace U_HEADER_ONLY_NAMESPACE
   2338 
   2339 #ifndef U_IN_DOXYGEN
   2340 // Bespoke specialization of reverse_iterator.
   2341 // The default implementation implements reverse operator*() and ++ in a way
   2342 // that does most of the same work twice for reading variable-length sequences.
   2343 template<typename CP32, typename UnitIter>
   2344 class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
   2345    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
   2346    using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
   2347    using UnsafeCodeUnits_ = U_HEADER_ONLY_NAMESPACE::UnsafeCodeUnits<CP32, UnitIter>;
   2348 
   2349    // Proxy type for operator->() (required by LegacyInputIterator)
   2350    // so that we don't promise always returning UnsafeCodeUnits.
   2351    class Proxy {
   2352    public:
   2353        explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
   2354        UnsafeCodeUnits_ &operator*() { return units_; }
   2355        UnsafeCodeUnits_ *operator->() { return &units_; }
   2356    private:
   2357        UnsafeCodeUnits_ units_;
   2358    };
   2359 
   2360 public:
   2361    using value_type = UnsafeCodeUnits_;
   2362    using reference = value_type;
   2363    using pointer = Proxy;
   2364    using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
   2365    using iterator_category = std::bidirectional_iterator_tag;
   2366 
   2367    U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> iter) :
   2368            p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {}
   2369    U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
   2370 
   2371    U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
   2372    U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
   2373 
   2374    U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
   2375    U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
   2376 
   2377    U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
   2378        return getLogicalPosition() == other.getLogicalPosition();
   2379    }
   2380    U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
   2381 
   2382    U_FORCE_INLINE UnsafeCodeUnits_ operator*() const {
   2383        if (state_ == 0) {
   2384            units_ = Impl::decAndRead(p_);
   2385            state_ = -1;
   2386        }
   2387        return units_;
   2388    }
   2389 
   2390    U_FORCE_INLINE Proxy operator->() const {
   2391        if (state_ == 0) {
   2392            units_ = Impl::decAndRead(p_);
   2393            state_ = -1;
   2394        }
   2395        return Proxy(units_);
   2396    }
   2397 
   2398    U_FORCE_INLINE reverse_iterator &operator++() {  // pre-increment
   2399        if (state_ < 0) {
   2400            // operator*() called decAndRead() so p_ is already behind.
   2401            state_ = 0;
   2402        } else if (state_ == 0) {
   2403            Impl::dec(p_);
   2404        } else /* state_ > 0 */ {
   2405            // operator--() called readAndInc() so we know how far to skip.
   2406            p_ = units_.begin();
   2407            state_ = 0;
   2408        }
   2409        return *this;
   2410    }
   2411 
   2412    U_FORCE_INLINE reverse_iterator operator++(int) {  // post-increment
   2413        if (state_ < 0) {
   2414            // operator*() called decAndRead() so p_ is already behind.
   2415            reverse_iterator result(*this);
   2416            state_ = 0;
   2417            return result;
   2418        } else if (state_ == 0) {
   2419            units_ = Impl::decAndRead(p_);
   2420            reverse_iterator result(*this);
   2421            result.state_ = -1;
   2422            // keep this->state_ == 0
   2423            return result;
   2424        } else /* state_ > 0 */ {
   2425            reverse_iterator result(*this);
   2426            // operator--() called readAndInc() so we know how far to skip.
   2427            p_ = units_.begin();
   2428            state_ = 0;
   2429            return result;
   2430        }
   2431    }
   2432 
   2433    U_FORCE_INLINE reverse_iterator &operator--() {  // pre-decrement
   2434        if (state_ < 0) {
   2435            // operator*() called decAndRead() so p_ is behind the logical position.
   2436            p_ = units_.end();
   2437        }
   2438        UnitIter p0 = p_;
   2439        units_ = Impl::readAndInc(p0, p_);
   2440        state_ = 1;
   2441        return *this;
   2442    }
   2443 
   2444    U_FORCE_INLINE reverse_iterator operator--(int) {  // post-decrement
   2445        reverse_iterator result(*this);
   2446        operator--();
   2447        return result;
   2448    }
   2449 
   2450 private:
   2451    U_FORCE_INLINE UnitIter getLogicalPosition() const {
   2452        return state_ >= 0 ? p_ : units_.end();
   2453    }
   2454 
   2455    // operator*() etc. are logically const.
   2456    mutable UnitIter p_;
   2457    // Keep state so that we call decAndRead() only once for both operator*() and ++
   2458    // to make it easy for the compiler to optimize.
   2459    mutable UnsafeCodeUnits_ units_;
   2460    // >0: units_ = readAndInc(), p_ = units limit
   2461    //  0: initial state
   2462    // <0: units_ = decAndRead(), p_ = units start
   2463    //     which means that p_ is behind its logical position
   2464    mutable int8_t state_ = 0;
   2465 };
   2466 #endif  // U_IN_DOXYGEN
   2467 
   2468 namespace U_HEADER_ONLY_NAMESPACE {
   2469 
   2470 /**
   2471 * UnsafeUTFIterator factory function.
   2472 * Deduces the UnitIter template parameter from the input.
   2473 *
   2474 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
   2475 * @tparam UnitIter Can usually be omitted/deduced:
   2476 *     An iterator (often a pointer) that returns a code unit type:
   2477 *     UTF-8: char or char8_t or uint8_t;
   2478 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
   2479 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
   2480 * @param iter code unit iterator
   2481 * @return an UnsafeUTFIterator&lt;CP32, UnitIter&gt;
   2482 *     for the given code unit iterator or character pointer
   2483 * @draft ICU 78
   2484 */
   2485 template<typename CP32, typename UnitIter>
   2486 auto unsafeUTFIterator(UnitIter iter) {
   2487    return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
   2488 }
   2489 
   2490 /**
   2491 * A C++ "range" for non-validating iteration over all of the code points of a code unit range.
   2492 * The string must be well-formed.
   2493 *
   2494 * Call unsafeUTFStringCodePoints() to have the compiler deduce the Range type.
   2495 *
   2496 * UnsafeUTFStringCodePoints is conditionally borrowed; that is, if Range is a borrowed range
   2497 * so is UnsafeUTFStringCodePoints<CP32, behavior, Range>.
   2498 * Note that when given a range r that is an lvalue and is not a view,  unsafeUTFStringCodePoints(r) uses
   2499 * a ref_view of r as the Range type, which is a borrowed range.
   2500 * In practice, this means that given a container variable r, the iterators of
   2501 * unsafeUTFStringCodePoints(r) can be used as long as iterators on r are valid, without having to keep
   2502 * unsafeUTFStringCodePoints(r) around.
   2503 * For instance:
   2504 * \code
   2505 *     std::u8string s = "𒇧𒇧";
   2506 *     // it outlives unsafeUTFStringCodePoints<char32_t>(s).
   2507 *     auto it = unsafeUTFStringCodePoints<char32_t>(s).begin();
   2508 *     ++it;
   2509 *     char32_t second_code_point = it->codePoint();  // OK.
   2510 * \endcode
   2511 *
   2512 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
   2513 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
   2514 * @draft ICU 78
   2515 * @see unsafeUTFStringCodePoints
   2516 */
   2517 template<typename CP32, typename Range>
   2518 class UnsafeUTFStringCodePoints {
   2519    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
   2520 public:
   2521    /**
   2522     * Constructs an empty C++ "range" object.
   2523     * @draft ICU 78
   2524     */
   2525    UnsafeUTFStringCodePoints() = default;
   2526 
   2527    /**
   2528     * Constructs a C++ "range" object over the code points in the string.
   2529     * @param unitRange input range
   2530     * @draft ICU 78
   2531     */
   2532    template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
   2533    explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
   2534    /**
   2535     * Constructs a C++ "range" object over the code points in the string,
   2536     * keeping a reference to the code unit range.  This overload is used by
   2537     * utfStringCodePoints in C++17; in C++20, a ref_view is used instead (via
   2538     * views::all).
   2539     * @param unitRange input range
   2540     * @draft ICU 78
   2541     */
   2542    template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
   2543    explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
   2544 
   2545    /** Copy constructor. @draft ICU 78 */
   2546    UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other) = default;
   2547 
   2548    /** Copy assignment operator. @draft ICU 78 */
   2549    UnsafeUTFStringCodePoints &operator=(const UnsafeUTFStringCodePoints &other) = default;
   2550 
   2551    /**
   2552     * @return the range start iterator
   2553     * @draft ICU 78
   2554     */
   2555    auto begin() {
   2556        return unsafeUTFIterator<CP32>(unitRange.begin());
   2557    }
   2558 
   2559    /**
   2560     * @return the range start iterator
   2561     * @draft ICU 78
   2562     */
   2563    template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
   2564    auto begin() const {
   2565        return unsafeUTFIterator<CP32>(unitRange.begin());
   2566    }
   2567 
   2568    /**
   2569     * @return the range limit (exclusive end) iterator
   2570     * @draft ICU 78
   2571     */
   2572    auto end() {
   2573        using UnitIter = decltype(unitRange.begin());
   2574        using LimitIter = decltype(unitRange.end());
   2575        if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
   2576            // Return the code unit sentinel.
   2577            return unitRange.end();
   2578        } else {
   2579            return unsafeUTFIterator<CP32>(unitRange.end());
   2580        }
   2581    }
   2582 
   2583    /**
   2584     * @return the range limit (exclusive end) iterator
   2585     * @draft ICU 78
   2586     */
   2587    template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
   2588    auto end() const {
   2589        using UnitIter = decltype(unitRange.begin());
   2590        using LimitIter = decltype(unitRange.end());
   2591        if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
   2592            // Return the code unit sentinel.
   2593            return unitRange.end();
   2594        } else {
   2595            return unsafeUTFIterator<CP32>(unitRange.end());
   2596        }
   2597    }
   2598 
   2599    /**
   2600     * @return std::reverse_iterator(end())
   2601     * @draft ICU 78
   2602     */
   2603    auto rbegin() const {
   2604        return std::make_reverse_iterator(end());
   2605    }
   2606 
   2607    /**
   2608     * @return std::reverse_iterator(begin())
   2609     * @draft ICU 78
   2610     */
   2611    auto rend() const {
   2612        return std::make_reverse_iterator(begin());
   2613    }
   2614 
   2615 private:
   2616    Range unitRange;
   2617 };
   2618 
   2619 /** @internal */
   2620 template<typename CP32>
   2621 struct UnsafeUTFStringCodePointsAdaptor
   2622 #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 &&                                         \
   2623    __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
   2624    : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
   2625 #endif
   2626 {
   2627    /** @internal */
   2628    template<typename Range>
   2629    auto operator()(Range &&unitRange) const {
   2630 #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10  // We need https://wg21.link/P2415R2.
   2631        return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
   2632 #else
   2633        if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
   2634            // Take basic_string_view by copy, not by reference.  In C++20 this is handled by
   2635            // all_t<Range>, which is Range if Range is a view.
   2636            return UnsafeUTFStringCodePoints<CP32, std::decay_t<Range>>(std::forward<Range>(unitRange));
   2637        } else {
   2638            return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
   2639        }
   2640 #endif
   2641    }
   2642 };
   2643 
   2644 
   2645 /**
   2646 * Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a
   2647 * "range" of code points in a code unit range. The string must be well-formed.
   2648 * Deduces the Range template parameter from the input, taking into account the value category: the
   2649 * code units will be referenced if possible, and moved if necessary.
   2650 *
   2651 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
   2652 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
   2653 * @param unitRange input range
   2654 * @return an UnsafeUTFStringCodePoints&lt;CP32, Range&gt; for the given unitRange
   2655 * @draft ICU 78
   2656 */
   2657 template<typename CP32>
   2658 constexpr UnsafeUTFStringCodePointsAdaptor<CP32> unsafeUTFStringCodePoints;
   2659 
   2660 }  // namespace U_HEADER_ONLY_NAMESPACE
   2661 
   2662 
#if defined(__cpp_lib_ranges)
// Make the code point ranges conditionally borrowed:
// each one is a borrowed range exactly when its underlying code unit Range is one.

// Validating code point range.
template <typename CP32, UTFIllFormedBehavior behavior, typename Range>
constexpr bool std::ranges::enable_borrowed_range<
    U_HEADER_ONLY_NAMESPACE::UTFStringCodePoints<CP32, behavior, Range>> =
    std::ranges::enable_borrowed_range<Range>;

// Non-validating code point range.
template <typename CP32, typename Range>
constexpr bool std::ranges::enable_borrowed_range<
    U_HEADER_ONLY_NAMESPACE::UnsafeUTFStringCodePoints<CP32, Range>> =
    std::ranges::enable_borrowed_range<Range>;
#endif
   2674 
   2675 #endif  // U_HIDE_DRAFT_API
   2676 #endif  // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
   2677 #endif  // __UTFITERATOR_H__