tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Segmenter.h (6465B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
      5 * You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 /* Classes to iterate over grapheme cluster, word, sentence, or line breaks. */
      8 
      9 #ifndef intl_components_Segmenter_h_
     10 #define intl_components_Segmenter_h_
     11 
     12 #include "mozilla/intl/ICUError.h"
     13 #include "mozilla/Maybe.h"
     14 #include "mozilla/Result.h"
     15 #include "mozilla/Span.h"
     16 #include "mozilla/UniquePtr.h"
     17 
     18 namespace icu4x::capi {
        // Forward declarations only: this header refers to these types solely
        // through pointers, so their definitions are not needed here.
     19 struct LineSegmenter;
     20 struct LineBreakIteratorUtf16;
     21 struct WordSegmenter;
     22 struct WordBreakIteratorUtf16;
     23 struct GraphemeClusterSegmenter;
     24 struct GraphemeClusterBreakIteratorUtf16;
     25 struct SentenceSegmenter;
     26 struct SentenceBreakIteratorUtf16;
     27 }  // namespace icu4x::capi
     28 
     29 namespace mozilla::intl {
     30 
     31 enum class SegmenterGranularity : uint8_t {
         // The kind of segment to iterate over. Stored in
         // SegmenterOptions::mGranularity.
     32  Grapheme,
     33  Word,
     34  Sentence,
     35  Line,
     36 };
     37 
     38 struct SegmenterOptions final {
         // Options used to construct a Segmenter. The granularity defaults to
         // SegmenterGranularity::Grapheme.
     39  SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme;
     40 };
     41 
     42 /**
     43 * Interface of segment iterators. Subclass this class to implement an
     44 * iterator for UTF-16 text.
        *
        * NOTE(review): mText is a Span, presumably a non-owning view, so the
        * caller likely has to keep the text alive while the iterator is in use.
        * Confirm against Span's documentation.
     45 */
     46 class SegmentIteratorUtf16 {
     47 public:
     48  virtual ~SegmentIteratorUtf16() = default;
     49 
     50  // Copy and move semantics are disabled. Move semantics could be enabled in
     51  // the future if needed.
     52  SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete;
     53  SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete;
     54  SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete;
     55  SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete;
     56 
     57  /**
     58   * Advance the iterator to the next break position.
     59   *
     60   * @return the break position. If there's no further break position, return
     61   * Nothing().
     62   */
     63  virtual Maybe<uint32_t> Next() = 0;
     64 
     65  /**
     66   * Advance the iterator to the first break position following the specified
     67   * position aPos.
     68   *
     69   * Note: if this iterator's current position is already >= aPos, this method
     70   * behaves the same as Next().
          *
          * This method is virtual but not pure, so a base implementation is
          * expected to be defined outside this header.
          *
          * @param aPos the position after which to look for a break.
          * @return presumably the break position, or Nothing() if there is none,
          * as for Next(). TODO: confirm against the implementation.
     71   */
     72  virtual Maybe<uint32_t> Seek(uint32_t aPos);
     73 
     74 protected:
         // Only subclasses can construct this class. aText is the text to iterate
         // over.
     75  explicit SegmentIteratorUtf16(Span<const char16_t> aText);
     76 
     77  // The text to iterate over.
     78  Span<const char16_t> mText;
     79 
     80  // The current break position within mText. Starts at 0.
     81  uint32_t mPos = 0;
     82 };
     83 
     84 // Each enum value has the same meaning as the corresponding `word-break`
     85 // property value in the CSS Text spec. See the details in
     86 // https://drafts.csswg.org/css-text-3/#word-break-property.
     87 enum class WordBreakRule : uint8_t {
     88  Normal = 0,
     89  BreakAll,
     90  KeepAll,
     91 };
     92 
     93 // Each enum value has the same meaning as the corresponding `line-break`
     94 // property value in the CSS Text spec. See the details in
     95 // https://drafts.csswg.org/css-text-3/#line-break-property.
     96 enum class LineBreakRule : uint8_t {
     97  Auto = 0,
     98  Loose,
     99  Normal,
    100  Strict,
    101  Anywhere,
    102 };
    103 
    104 // Extra options for the line break iterator. The defaults are
        // WordBreakRule::Normal, LineBreakRule::Auto, and false for the script
        // flag.
    105 struct LineBreakOptions final {
    106  WordBreakRule mWordBreakRule = WordBreakRule::Normal;
    107  LineBreakRule mLineBreakRule = LineBreakRule::Auto;
         // NOTE(review): presumably tells the iterator that the text's script is
         // Chinese or Japanese so that it can adjust its line breaking. Confirm
         // how the implementation uses this flag.
    108  bool mScriptIsChineseOrJapanese = false;
    109 };
    110 
    111 /**
    112 * Line break iterator for UTF-16 text.
        *
        * The constructor takes the text to iterate over and optional
        * LineBreakOptions. When aOptions is omitted, a default-constructed
        * LineBreakOptions is used.
    113 */
    114 class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
    115 public:
    116  explicit LineBreakIteratorUtf16(Span<const char16_t> aText,
    117                                  const LineBreakOptions& aOptions = {});
    118  ~LineBreakIteratorUtf16() override;
    119 
    120  Maybe<uint32_t> Next() override;
    121  Maybe<uint32_t> Seek(uint32_t aPos) override;
    122 
    123 private:
         // The options this iterator was given.
    124  LineBreakOptions mOptions;
    125 
         // Raw pointers to ICU4X C API objects; both start as nullptr.
         // NOTE(review): presumably created in the constructor and released in
         // the destructor. Confirm in the implementation file.
    126  icu4x::capi::LineSegmenter* mSegmenter = nullptr;
    127  icu4x::capi::LineBreakIteratorUtf16* mIterator = nullptr;
    128 };
    129 
    130 /**
    131 * Word break iterator for UTF-16 text.
    132 */
    133 class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
    134 public:
    135  explicit WordBreakIteratorUtf16(Span<const char16_t> aText);
    136  ~WordBreakIteratorUtf16() override;
    137 
         // NOTE(review): presumably makes this iterator start over on the new
         // text aText. Confirm in the implementation file.
    138  void Reset(Span<const char16_t> aText);
    139  Maybe<uint32_t> Next() override;
    140  Maybe<uint32_t> Seek(uint32_t aPos) override;
    141 
    142 private:
         // Raw pointers to ICU4X C API objects; both start as nullptr.
         // NOTE(review): presumably released in the destructor. Confirm in the
         // implementation file.
    143  icu4x::capi::WordSegmenter* mSegmenter = nullptr;
    144  icu4x::capi::WordBreakIteratorUtf16* mIterator = nullptr;
    145 };
    146 
    147 /**
    148 * Grapheme cluster break iterator for UTF-16 text.
    149 */
    150 class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
    151 public:
    152  explicit GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText);
    153  ~GraphemeClusterBreakIteratorUtf16() override;
    154 
    155  Maybe<uint32_t> Next() override;
    156  Maybe<uint32_t> Seek(uint32_t aPos) override;
    157 
    158 private:
         // Unlike the other iterators in this header, the segmenter is a static
         // member, so it is shared by all instances. It is defined outside this
         // header.
    159  static icu4x::capi::GraphemeClusterSegmenter* sSegmenter;
         // Per-instance ICU4X iterator; starts as nullptr.
    160  icu4x::capi::GraphemeClusterBreakIteratorUtf16* mIterator = nullptr;
    161 };
    162 
    163 /**
    164 * Grapheme cluster break reverse iterator for UTF-16 text.
    165 *
    166 * Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it
    167 * at your own risk.
        *
        * This class declares no ICU4X members and no destructor of its own.
        * NOTE(review): Next() presumably walks break positions from the end of
        * the text towards the start. Confirm in the implementation file.
    168 */
    169 class GraphemeClusterBreakReverseIteratorUtf16 final
    170    : public SegmentIteratorUtf16 {
    171 public:
    172  explicit GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText);
    173 
    174  Maybe<uint32_t> Next() override;
    175  Maybe<uint32_t> Seek(uint32_t aPos) override;
    176 };
    177 
    178 /**
    179 * Sentence break iterator for UTF-16 text.
    180 */
    181 class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 {
    182 public:
    183  explicit SentenceBreakIteratorUtf16(Span<const char16_t> aText);
    184  ~SentenceBreakIteratorUtf16() override;
    185 
    186  Maybe<uint32_t> Next() override;
    187  Maybe<uint32_t> Seek(uint32_t aPos) override;
    188 
    189 private:
         // Raw pointers to ICU4X C API objects; both start as nullptr.
         // NOTE(review): presumably released in the destructor. Confirm in the
         // implementation file.
    190  icu4x::capi::SentenceSegmenter* mSegmenter = nullptr;
    191  icu4x::capi::SentenceBreakIteratorUtf16* mIterator = nullptr;
    192 };
    193 
    194 /**
    195 * This component is a Mozilla-focused API for working with segmenters in
    196 * internationalization code.
    197 *
    198 * This is a factory class. Call Segment() to create an iterator over a text
    199 * with the granularity given in the SegmenterOptions.
    200 */
    201 class Segmenter final {
    202 public:
    203  // NOTE: aLocale is a no-op currently.
         // Returns a Result holding either a UniquePtr<Segmenter> or an ICUError.
    204  static Result<UniquePtr<Segmenter>, ICUError> TryCreate(
    205      Span<const char> aLocale, const SegmenterOptions& aOptions);
    206 
         // Copies aOptions into mOptions. aLocale is not used by this constructor.
    207  explicit Segmenter(Span<const char> aLocale, const SegmenterOptions& aOptions)
    208      : mOptions(aOptions) {}
    209 
    210  // Creates an iterator over aText with the granularity given in mOptions.
    211  UniquePtr<SegmentIteratorUtf16> Segment(Span<const char16_t> aText) const;
    212 
    213  // TODO: Implement an iterator for Latin1 text.
    214  // UniquePtr<SegmentIteratorLatin1> Segment(Span<const uint8_t> aText) const;
    215 
    216 private:
         // The options given at construction.
    217  SegmenterOptions mOptions;
    218 };
    219 
    220 }  // namespace mozilla::intl
    221 
    222 #endif