Segmenter.h (6465B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this file, 5 * You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 /* Classes to iterate over grapheme, word, sentence, or line. */ 8 9 #ifndef intl_components_Segmenter_h_ 10 #define intl_components_Segmenter_h_ 11 12 #include "mozilla/intl/ICUError.h" 13 #include "mozilla/Maybe.h" 14 #include "mozilla/Result.h" 15 #include "mozilla/Span.h" 16 #include "mozilla/UniquePtr.h" 17 18 namespace icu4x::capi { 19 struct LineSegmenter; 20 struct LineBreakIteratorUtf16; 21 struct WordSegmenter; 22 struct WordBreakIteratorUtf16; 23 struct GraphemeClusterSegmenter; 24 struct GraphemeClusterBreakIteratorUtf16; 25 struct SentenceSegmenter; 26 struct SentenceBreakIteratorUtf16; 27 } // namespace icu4x::capi 28 29 namespace mozilla::intl { 30 31 enum class SegmenterGranularity : uint8_t { 32 Grapheme, 33 Word, 34 Sentence, 35 Line, 36 }; 37 38 struct SegmenterOptions final { 39 SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme; 40 }; 41 42 /** 43 * Interface of segment iterators. Subclass this class to implement iterator for 44 * UTF-16 text. 45 */ 46 class SegmentIteratorUtf16 { 47 public: 48 virtual ~SegmentIteratorUtf16() = default; 49 50 // Disable copy or move semantics. Move semantic could be enabled in the 51 // future if needed. 52 SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete; 53 SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete; 54 SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete; 55 SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete; 56 57 /** 58 * Advance the iterator to the next break position. 59 * 60 * @return the break position. If there's no further break position, return 61 * Nothing(). 62 */ 63 virtual Maybe<uint32_t> Next() = 0; 64 65 /** 66 * Advance the iterator to the first break position following the specified 67 * position aPos. 68 * 69 * Note: if this iterator's current position is already >= aPos, this method 70 * behaves the same as Next(). 71 */ 72 virtual Maybe<uint32_t> Seek(uint32_t aPos); 73 74 protected: 75 explicit SegmentIteratorUtf16(Span<const char16_t> aText); 76 77 // The text to iterate over. 78 Span<const char16_t> mText; 79 80 // The current break position within mText. 81 uint32_t mPos = 0; 82 }; 83 84 // Each enum value has the same meaning with respect to the `word-break` 85 // property values in the CSS Text spec. See the details in 86 // https://drafts.csswg.org/css-text-3/#word-break-property 87 enum class WordBreakRule : uint8_t { 88 Normal = 0, 89 BreakAll, 90 KeepAll, 91 }; 92 93 // Each enum value has the same meaning with respect to the `line-break` 94 // property values in the CSS Text spec. See the details in 95 // https://drafts.csswg.org/css-text-3/#line-break-property. 96 enum class LineBreakRule : uint8_t { 97 Auto = 0, 98 Loose, 99 Normal, 100 Strict, 101 Anywhere, 102 }; 103 104 // Extra options for line break iterator. 105 struct LineBreakOptions final { 106 WordBreakRule mWordBreakRule = WordBreakRule::Normal; 107 LineBreakRule mLineBreakRule = LineBreakRule::Auto; 108 bool mScriptIsChineseOrJapanese = false; 109 }; 110 111 /** 112 * Line break iterator for UTF-16 text. 113 */ 114 class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 { 115 public: 116 explicit LineBreakIteratorUtf16(Span<const char16_t> aText, 117 const LineBreakOptions& aOptions = {}); 118 ~LineBreakIteratorUtf16() override; 119 120 Maybe<uint32_t> Next() override; 121 Maybe<uint32_t> Seek(uint32_t aPos) override; 122 123 private: 124 LineBreakOptions mOptions; 125 126 icu4x::capi::LineSegmenter* mSegmenter = nullptr; 127 icu4x::capi::LineBreakIteratorUtf16* mIterator = nullptr; 128 }; 129 130 /** 131 * Word break iterator for UTF-16 text. 132 */ 133 class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { 134 public: 135 explicit WordBreakIteratorUtf16(Span<const char16_t> aText); 136 ~WordBreakIteratorUtf16() override; 137 138 void Reset(Span<const char16_t> aText); 139 Maybe<uint32_t> Next() override; 140 Maybe<uint32_t> Seek(uint32_t aPos) override; 141 142 private: 143 icu4x::capi::WordSegmenter* mSegmenter = nullptr; 144 icu4x::capi::WordBreakIteratorUtf16* mIterator = nullptr; 145 }; 146 147 /** 148 * Grapheme cluster break iterator for UTF-16 text. 149 */ 150 class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 { 151 public: 152 explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText); 153 ~GraphemeClusterBreakIteratorUtf16() override; 154 155 Maybe<uint32_t> Next() override; 156 Maybe<uint32_t> Seek(uint32_t aPos) override; 157 158 private: 159 static icu4x::capi::GraphemeClusterSegmenter* sSegmenter; 160 icu4x::capi::GraphemeClusterBreakIteratorUtf16* mIterator = nullptr; 161 }; 162 163 /** 164 * Grapheme cluster break reverse iterator for UTF-16 text. 165 * 166 * Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it 167 * at your own risk. 168 */ 169 class GraphemeClusterBreakReverseIteratorUtf16 final 170 : public SegmentIteratorUtf16 { 171 public: 172 explicit GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText); 173 174 Maybe<uint32_t> Next() override; 175 Maybe<uint32_t> Seek(uint32_t aPos) override; 176 }; 177 178 /** 179 * Sentence break iterator for UTF-16 text. 180 */ 181 class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 { 182 public: 183 explicit SentenceBreakIteratorUtf16(Span<const char16_t> aText); 184 ~SentenceBreakIteratorUtf16() override; 185 186 Maybe<uint32_t> Next() override; 187 Maybe<uint32_t> Seek(uint32_t aPos) override; 188 189 private: 190 icu4x::capi::SentenceSegmenter* mSegmenter = nullptr; 191 icu4x::capi::SentenceBreakIteratorUtf16* mIterator = nullptr; 192 }; 193 194 /** 195 * This component is a Mozilla-focused API for working with segmenters in 196 * internationalization code. 197 * 198 * This is a factor class. Calling Segment() to create an iterator over a text 199 * of given granularity. 200 */ 201 class Segmenter final { 202 public: 203 // NOTE: aLocale is a no-op currently. 204 static Result<UniquePtr<Segmenter>, ICUError> TryCreate( 205 Span<const char> aLocale, const SegmenterOptions& aOptions); 206 207 explicit Segmenter(Span<const char> aLocale, const SegmenterOptions& aOptions) 208 : mOptions(aOptions) {} 209 210 // Creates an iterator over aText of a given granularity in mOptions. 211 UniquePtr<SegmentIteratorUtf16> Segment(Span<const char16_t> aText) const; 212 213 // TODO: Implement an iterator for Latin1 text. 214 // UniquePtr<SegmentIteratorLatin1> Segment(Span<const uint8_t> aText) const; 215 216 private: 217 SegmenterOptions mOptions; 218 }; 219 220 } // namespace mozilla::intl 221 222 #endif