tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Segmenter.cpp (13683B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
      5 * You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 /* Classes to iterate over grapheme, word, sentence, or line. */
      8 
      9 #include "mozilla/intl/Segmenter.h"
     10 
     11 #include "icu4x/GraphemeClusterSegmenter.hpp"
     12 #include "icu4x/LineSegmenter.hpp"
     13 #include "icu4x/SentenceSegmenter.hpp"
     14 #include "icu4x/WordSegmenter.hpp"
     15 #include "mozilla/ClearOnShutdown.h"
     16 #include "mozilla/intl/LineBreaker.h"
     17 #include "mozilla/intl/WordBreaker.h"
     18 #include "mozilla/intl/UnicodeProperties.h"
     19 #include "mozilla/StaticPrefs_intl.h"
     20 #include "nsUnicodeProperties.h"
     21 #include "nsCharTraits.h"
     22 #include "nsThreadUtils.h"
     23 
     24 #include <mutex>
     25 
     26 using namespace icu4x;
     27 using namespace mozilla::unicode;
     28 
     29 namespace mozilla::intl {
     30 
     31 SegmentIteratorUtf16::SegmentIteratorUtf16(Span<const char16_t> aText)
     32    : mText(aText) {}
     33 
     34 Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
     35  if (mPos < aPos) {
     36    mPos = aPos;
     37  }
     38  return Next();
     39 }
     40 
     41 LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
     42                                               const LineBreakOptions& aOptions)
     43    : SegmentIteratorUtf16(aText), mOptions(aOptions) {
     44  if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
     45    return;
     46  }
     47  mSegmenter = capi::icu4x_LineSegmenter_create_auto_mv1();
     48  mIterator = capi::icu4x_LineSegmenter_segment_utf16_mv1(
     49      mSegmenter, {mText.Elements(), mText.Length()});
     50 }
     51 
     52 LineBreakIteratorUtf16::~LineBreakIteratorUtf16() {
     53  if (mIterator) {
     54    capi::icu4x_LineBreakIteratorUtf16_destroy_mv1(mIterator);
     55  }
     56  if (mSegmenter) {
     57    capi::icu4x_LineSegmenter_destroy_mv1(mSegmenter);
     58  }
     59 }
     60 
     61 Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
     62  if (mIterator) {
     63    const int32_t nextPos =
     64        capi::icu4x_LineBreakIteratorUtf16_next_mv1(mIterator);
     65    if (nextPos < 0) {
     66      return Nothing();
     67    }
     68    if (!nextPos) {
     69      return Next();
     70    }
     71    mPos = nextPos;
     72    return Some(mPos);
     73  }
     74  const int32_t nextPos =
     75      LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
     76  if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
     77    return Nothing();
     78  }
     79  mPos = nextPos;
     80  return Some(mPos);
     81 }
     82 
     83 Maybe<uint32_t> LineBreakIteratorUtf16::Seek(uint32_t aPos) {
     84  if (mIterator) {
     85    if (mPos >= aPos) {
     86      return Next();
     87    }
     88 
     89    while (mPos < aPos) {
     90      const int32_t nextPos =
     91          capi::icu4x_LineBreakIteratorUtf16_next_mv1(mIterator);
     92      if (nextPos < 0) {
     93        return Nothing();
     94      }
     95      mPos = static_cast<uint32_t>(nextPos);
     96    }
     97 
     98    if (aPos < mPos) {
     99      return Some(mPos);
    100    }
    101 
    102    return Next();
    103  }
    104  return SegmentIteratorUtf16::Seek(aPos);
    105 }
    106 
    107 WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
    108    : SegmentIteratorUtf16(aText) {
    109  if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
    110    return;
    111  }
    112  mSegmenter = capi::icu4x_WordSegmenter_create_auto_mv1();
    113  mIterator = capi::icu4x_WordSegmenter_segment_utf16_mv1(
    114      mSegmenter, {mText.Elements(), mText.Length()});
    115 }
    116 
    117 WordBreakIteratorUtf16::~WordBreakIteratorUtf16() {
    118  if (mIterator) {
    119    capi::icu4x_WordBreakIteratorUtf16_destroy_mv1(mIterator);
    120  }
    121  if (mSegmenter) {
    122    capi::icu4x_WordSegmenter_destroy_mv1(mSegmenter);
    123  }
    124 }
    125 
    126 void WordBreakIteratorUtf16::Reset(Span<const char16_t> aText) {
    127  mPos = 0;
    128  mText = aText;
    129  if (mIterator) {
    130    capi::icu4x_WordBreakIteratorUtf16_destroy_mv1(mIterator);
    131    mIterator = nullptr;
    132  }
    133  if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
    134    return;
    135  }
    136  mIterator = capi::icu4x_WordSegmenter_segment_utf16_mv1(
    137      mSegmenter, {mText.Elements(), mText.Length()});
    138 }
    139 
    140 Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
    141  if (mIterator) {
    142    const int32_t nextPos =
    143        capi::icu4x_WordBreakIteratorUtf16_next_mv1(mIterator);
    144    if (nextPos < 0) {
    145      return Nothing();
    146    }
    147    if (!nextPos) {
    148      return Next();
    149    }
    150    mPos = nextPos;
    151    return Some(mPos);
    152  }
    153  const int32_t nextPos =
    154      WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
    155  if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
    156    return Nothing();
    157  }
    158  mPos = nextPos;
    159  return Some(mPos);
    160 }
    161 
    162 Maybe<uint32_t> WordBreakIteratorUtf16::Seek(uint32_t aPos) {
    163  if (mIterator) {
    164    if (mPos >= aPos) {
    165      return Next();
    166    }
    167 
    168    while (mPos < aPos) {
    169      const int32_t nextPos =
    170          capi::icu4x_WordBreakIteratorUtf16_next_mv1(mIterator);
    171      if (nextPos < 0) {
    172        return Nothing();
    173      }
    174      mPos = static_cast<uint32_t>(nextPos);
    175    }
    176 
    177    if (aPos < mPos) {
    178      return Some(mPos);
    179    }
    180 
    181    return Next();
    182  }
    183  return SegmentIteratorUtf16::Seek(aPos);
    184 }
    185 
    186 capi::GraphemeClusterSegmenter* GraphemeClusterBreakIteratorUtf16::sSegmenter =
    187    nullptr;
    188 
    189 GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
    190    Span<const char16_t> aText)
    191    : SegmentIteratorUtf16(aText) {
    192  if (!StaticPrefs::intl_icu4x_segmenter_enabled()) {
    193    return;
    194  }
    195  static std::once_flag sOnce;
    196 
    197  std::call_once(sOnce, [] {
    198    auto result = capi::icu4x_GraphemeClusterSegmenter_create_mv1();
    199    sSegmenter = result;
    200 
    201    NS_DispatchToMainThread(
    202        NS_NewRunnableFunction("GraphemeClusterBreakIteratorUtf16", [] {
    203          RunOnShutdown([] {
    204            capi::icu4x_GraphemeClusterSegmenter_destroy_mv1(sSegmenter);
    205            sSegmenter = nullptr;
    206          });
    207        }));
    208  });
    209 
    210  MOZ_RELEASE_ASSERT(sSegmenter);
    211  mIterator = capi::icu4x_GraphemeClusterSegmenter_segment_utf16_mv1(
    212      sSegmenter, {mText.Elements(), mText.Length()});
    213 }
    214 
    215 GraphemeClusterBreakIteratorUtf16::~GraphemeClusterBreakIteratorUtf16() {
    216  if (mIterator) {
    217    capi::icu4x_GraphemeClusterBreakIteratorUtf16_destroy_mv1(mIterator);
    218  }
    219 }
    220 
    221 enum HSType {
    222  HST_NONE = U_HST_NOT_APPLICABLE,
    223  HST_L = U_HST_LEADING_JAMO,
    224  HST_V = U_HST_VOWEL_JAMO,
    225  HST_T = U_HST_TRAILING_JAMO,
    226  HST_LV = U_HST_LV_SYLLABLE,
    227  HST_LVT = U_HST_LVT_SYLLABLE
    228 };
    229 
    230 static HSType GetHangulSyllableType(uint32_t aCh) {
    231  return HSType(UnicodeProperties::GetIntPropertyValue(
    232      aCh, UnicodeProperties::IntProperty::HangulSyllableType));
    233 }
    234 
    235 Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
    236  const auto len = mText.Length();
    237  if (mIterator) {
    238    const int32_t nextPos =
    239        capi::icu4x_GraphemeClusterBreakIteratorUtf16_next_mv1(mIterator);
    240    if (nextPos < 0) {
    241      return Nothing();
    242    }
    243    if (!nextPos) {
    244      return Next();
    245    }
    246    mPos = nextPos;
    247    return Some(mPos);
    248  }
    249  if (mPos >= len) {
    250    // The iterator has already reached the end.
    251    return Nothing();
    252  }
    253 
    254  uint32_t ch = mText[mPos++];
    255 
    256  if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) {
    257    ch = SURROGATE_TO_UCS4(ch, mText[mPos++]);
    258  } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
    259             (ch >= 0xac00 && ch <= 0xd7ff)) {
    260    // Handle conjoining Jamo that make Hangul syllables
    261    HSType hangulState = GetHangulSyllableType(ch);
    262    while (mPos < len) {
    263      ch = mText[mPos];
    264      HSType hangulType = GetHangulSyllableType(ch);
    265      switch (hangulType) {
    266        case HST_L:
    267        case HST_LV:
    268        case HST_LVT:
    269          if (hangulState == HST_L) {
    270            hangulState = hangulType;
    271            mPos++;
    272            continue;
    273          }
    274          break;
    275        case HST_V:
    276          if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
    277              (hangulState != HST_LVT)) {
    278            hangulState = hangulType;
    279            mPos++;
    280            continue;
    281          }
    282          break;
    283        case HST_T:
    284          if (hangulState != HST_NONE && hangulState != HST_L) {
    285            hangulState = hangulType;
    286            mPos++;
    287            continue;
    288          }
    289          break;
    290        default:
    291          break;
    292      }
    293      break;
    294    }
    295  }
    296 
    297  const uint32_t kVS16 = 0xfe0f;
    298  const uint32_t kZWJ = 0x200d;
    299  // UTF-16 surrogate values for Fitzpatrick type modifiers
    300  const uint32_t kFitzpatrickHigh = 0xD83C;
    301  const uint32_t kFitzpatrickLowFirst = 0xDFFB;
    302  const uint32_t kFitzpatrickLowLast = 0xDFFF;
    303 
    304  // Checking the emoji-presentation property of the base character is a bit
    305  // expensive, so we do it lazily.
    306  enum class EmojiStatus : uint8_t {
    307    No,
    308    Yes,
    309    Unknown,
    310  } baseIsEmojiStatus = EmojiStatus::Unknown;
    311 
    312  // Remember the base character and the position of the next, in case we need
    313  // to evaluate its emoji status.
    314  uint32_t baseCh = ch;
    315  uint32_t afterBase = mPos;
    316 
    317  auto isFitzpatrickModifierAt = [&](uint32_t aPos) -> bool {
    318    return aPos + 1 < len && mText[aPos] == kFitzpatrickHigh &&
    319           mText[aPos + 1] >= kFitzpatrickLowFirst &&
    320           mText[aPos + 1] <= kFitzpatrickLowLast;
    321  };
    322 
    323  auto baseIsEmoji = [&]() -> bool {
    324    if (baseIsEmojiStatus == EmojiStatus::Unknown) {
    325      auto basePresentation = GetEmojiPresentation(baseCh);
    326      baseIsEmojiStatus =
    327          basePresentation == EmojiDefault ||
    328                  (basePresentation == TextDefault &&
    329                   ((afterBase < len && mText[afterBase] == kVS16) ||
    330                    isFitzpatrickModifierAt(afterBase)))
    331              ? EmojiStatus::Yes
    332              : EmojiStatus::No;
    333    }
    334    return baseIsEmojiStatus == EmojiStatus::Yes;
    335  };
    336 
    337  bool prevWasZwj = false;
    338 
    339  while (mPos < len) {
    340    ch = mText[mPos];
    341    size_t chLen = 1;
    342 
    343    // Check for surrogate pairs; note that isolated surrogates will just
    344    // be treated as generic (non-cluster-extending) characters here,
    345    // which is fine for cluster-iterating purposes
    346    if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) {
    347      ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]);
    348      chLen = 2;
    349    }
    350 
    351    bool extendCluster =
    352        IsClusterExtender(ch) ||
    353        (prevWasZwj && baseIsEmoji() &&
    354         ((GetEmojiPresentation(ch) == EmojiDefault) ||
    355          (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len &&
    356           mText[mPos + chLen] == kVS16)));
    357    if (!extendCluster) {
    358      break;
    359    }
    360 
    361    prevWasZwj = (ch == kZWJ);
    362    mPos += chLen;
    363  }
    364 
    365  MOZ_ASSERT(mPos <= len, "Next() has overshot the string!");
    366  return Some(mPos);
    367 }
    368 
    369 Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Seek(uint32_t aPos) {
    370  if (mIterator) {
    371    if (mPos >= aPos) {
    372      return Next();
    373    }
    374 
    375    while (mPos < aPos) {
    376      const int32_t nextPos =
    377          capi::icu4x_GraphemeClusterBreakIteratorUtf16_next_mv1(mIterator);
    378      if (nextPos < 0) {
    379        return Nothing();
    380      }
    381      mPos = static_cast<uint32_t>(nextPos);
    382    }
    383 
    384    if (aPos < mPos) {
    385      return Some(mPos);
    386    }
    387 
    388    return Next();
    389  }
    390  return SegmentIteratorUtf16::Seek(aPos);
    391 }
    392 
    393 GraphemeClusterBreakReverseIteratorUtf16::
    394    GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
    395    : SegmentIteratorUtf16(aText) {
    396  mPos = mText.Length();
    397 }
    398 
    399 Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Next() {
    400  if (mPos == 0) {
    401    return Nothing();
    402  }
    403 
    404  uint32_t ch;
    405  do {
    406    ch = mText[--mPos];
    407 
    408    if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) {
    409      ch = SURROGATE_TO_UCS4(mText[--mPos], ch);
    410    }
    411 
    412    if (!IsClusterExtender(ch)) {
    413      break;
    414    }
    415  } while (mPos > 0);
    416 
    417  // XXX May need to handle conjoining Jamo
    418 
    419  return Some(mPos);
    420 }
    421 
    422 Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) {
    423  if (mPos > aPos) {
    424    mPos = aPos;
    425  }
    426  return Next();
    427 }
    428 
    429 SentenceBreakIteratorUtf16::SentenceBreakIteratorUtf16(
    430    Span<const char16_t> aText)
    431    : SegmentIteratorUtf16(aText) {
    432  mSegmenter = capi::icu4x_SentenceSegmenter_create_mv1();
    433  mIterator = capi::icu4x_SentenceSegmenter_segment_utf16_mv1(
    434      mSegmenter, {mText.Elements(), mText.Length()});
    435 }
    436 
    437 SentenceBreakIteratorUtf16::~SentenceBreakIteratorUtf16() {
    438  if (mIterator) {
    439    capi::icu4x_SentenceBreakIteratorUtf16_destroy_mv1(mIterator);
    440  }
    441  if (mSegmenter) {
    442    capi::icu4x_SentenceSegmenter_destroy_mv1(mSegmenter);
    443  }
    444 }
    445 
    446 Maybe<uint32_t> SentenceBreakIteratorUtf16::Seek(uint32_t aPos) {
    447  if (!mIterator) {
    448    return Nothing();
    449  }
    450 
    451  if (mPos >= aPos) {
    452    return Next();
    453  }
    454 
    455  while (mPos < aPos) {
    456    const int32_t nextPos =
    457        capi::icu4x_SentenceBreakIteratorUtf16_next_mv1(mIterator);
    458    if (nextPos < 0) {
    459      return Nothing();
    460    }
    461    mPos = static_cast<uint32_t>(nextPos);
    462  }
    463 
    464  if (aPos < mPos) {
    465    return Some(mPos);
    466  }
    467 
    468  return Next();
    469 }
    470 
    471 Maybe<uint32_t> SentenceBreakIteratorUtf16::Next() {
    472  if (!mIterator) {
    473    return Nothing();
    474  }
    475 
    476  const int32_t nextPos =
    477      capi::icu4x_SentenceBreakIteratorUtf16_next_mv1(mIterator);
    478  if (nextPos < 0) {
    479    return Nothing();
    480  }
    481  if (!nextPos) {
    482    return Next();
    483  }
    484  mPos = nextPos;
    485  return Some(mPos);
    486 }
    487 
    488 Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
    489    Span<const char> aLocale, const SegmenterOptions& aOptions) {
    490  return MakeUnique<Segmenter>(aLocale, aOptions);
    491 }
    492 
    493 UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
    494    Span<const char16_t> aText) const {
    495  switch (mOptions.mGranularity) {
    496    case SegmenterGranularity::Grapheme:
    497      return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
    498    case SegmenterGranularity::Sentence:
    499      return MakeUnique<SentenceBreakIteratorUtf16>(aText);
    500    case SegmenterGranularity::Word:
    501      return MakeUnique<WordBreakIteratorUtf16>(aText);
    502    case SegmenterGranularity::Line:
    503      return MakeUnique<LineBreakIteratorUtf16>(aText);
    504  }
    505  MOZ_ASSERT_UNREACHABLE("All granularities must be handled!");
    506  return nullptr;
    507 }
    508 
    509 }  // namespace mozilla::intl