Segmenter.cpp (13683B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this file, 5 * You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 /* Classes to iterate over grapheme, word, sentence, or line. */ 8 9 #include "mozilla/intl/Segmenter.h" 10 11 #include "icu4x/GraphemeClusterSegmenter.hpp" 12 #include "icu4x/LineSegmenter.hpp" 13 #include "icu4x/SentenceSegmenter.hpp" 14 #include "icu4x/WordSegmenter.hpp" 15 #include "mozilla/ClearOnShutdown.h" 16 #include "mozilla/intl/LineBreaker.h" 17 #include "mozilla/intl/WordBreaker.h" 18 #include "mozilla/intl/UnicodeProperties.h" 19 #include "mozilla/StaticPrefs_intl.h" 20 #include "nsUnicodeProperties.h" 21 #include "nsCharTraits.h" 22 #include "nsThreadUtils.h" 23 24 #include <mutex> 25 26 using namespace icu4x; 27 using namespace mozilla::unicode; 28 29 namespace mozilla::intl { 30 31 SegmentIteratorUtf16::SegmentIteratorUtf16(Span<const char16_t> aText) 32 : mText(aText) {} 33 34 Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) { 35 if (mPos < aPos) { 36 mPos = aPos; 37 } 38 return Next(); 39 } 40 41 LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText, 42 const LineBreakOptions& aOptions) 43 : SegmentIteratorUtf16(aText), mOptions(aOptions) { 44 if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { 45 return; 46 } 47 mSegmenter = capi::icu4x_LineSegmenter_create_auto_mv1(); 48 mIterator = capi::icu4x_LineSegmenter_segment_utf16_mv1( 49 mSegmenter, {mText.Elements(), mText.Length()}); 50 } 51 52 LineBreakIteratorUtf16::~LineBreakIteratorUtf16() { 53 if (mIterator) { 54 capi::icu4x_LineBreakIteratorUtf16_destroy_mv1(mIterator); 55 } 56 if (mSegmenter) { 57 capi::icu4x_LineSegmenter_destroy_mv1(mSegmenter); 58 } 59 } 60 61 Maybe<uint32_t> LineBreakIteratorUtf16::Next() { 62 if (mIterator) { 63 const int32_t nextPos = 64 capi::icu4x_LineBreakIteratorUtf16_next_mv1(mIterator); 65 if (nextPos < 0) { 66 return Nothing(); 67 } 68 if (!nextPos) { 69 return Next(); 70 } 71 mPos = nextPos; 72 return Some(mPos); 73 } 74 const int32_t nextPos = 75 LineBreaker::Next(mText.Elements(), mText.Length(), mPos); 76 if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) { 77 return Nothing(); 78 } 79 mPos = nextPos; 80 return Some(mPos); 81 } 82 83 Maybe<uint32_t> LineBreakIteratorUtf16::Seek(uint32_t aPos) { 84 if (mIterator) { 85 if (mPos >= aPos) { 86 return Next(); 87 } 88 89 while (mPos < aPos) { 90 const int32_t nextPos = 91 capi::icu4x_LineBreakIteratorUtf16_next_mv1(mIterator); 92 if (nextPos < 0) { 93 return Nothing(); 94 } 95 mPos = static_cast<uint32_t>(nextPos); 96 } 97 98 if (aPos < mPos) { 99 return Some(mPos); 100 } 101 102 return Next(); 103 } 104 return SegmentIteratorUtf16::Seek(aPos); 105 } 106 107 WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText) 108 : SegmentIteratorUtf16(aText) { 109 if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { 110 return; 111 } 112 mSegmenter = capi::icu4x_WordSegmenter_create_auto_mv1(); 113 mIterator = capi::icu4x_WordSegmenter_segment_utf16_mv1( 114 mSegmenter, {mText.Elements(), mText.Length()}); 115 } 116 117 WordBreakIteratorUtf16::~WordBreakIteratorUtf16() { 118 if (mIterator) { 119 capi::icu4x_WordBreakIteratorUtf16_destroy_mv1(mIterator); 120 } 121 if (mSegmenter) { 122 capi::icu4x_WordSegmenter_destroy_mv1(mSegmenter); 123 } 124 } 125 126 void WordBreakIteratorUtf16::Reset(Span<const char16_t> aText) { 127 mPos = 0; 128 mText = aText; 129 if (mIterator) { 130 capi::icu4x_WordBreakIteratorUtf16_destroy_mv1(mIterator); 131 mIterator = nullptr; 132 } 133 if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { 134 return; 135 } 136 mIterator = capi::icu4x_WordSegmenter_segment_utf16_mv1( 137 mSegmenter, {mText.Elements(), mText.Length()}); 138 } 139 140 Maybe<uint32_t> WordBreakIteratorUtf16::Next() { 141 if (mIterator) { 142 const int32_t nextPos = 143 capi::icu4x_WordBreakIteratorUtf16_next_mv1(mIterator); 144 if (nextPos < 0) { 145 return Nothing(); 146 } 147 if (!nextPos) { 148 return Next(); 149 } 150 mPos = nextPos; 151 return Some(mPos); 152 } 153 const int32_t nextPos = 154 WordBreaker::Next(mText.Elements(), mText.Length(), mPos); 155 if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) { 156 return Nothing(); 157 } 158 mPos = nextPos; 159 return Some(mPos); 160 } 161 162 Maybe<uint32_t> WordBreakIteratorUtf16::Seek(uint32_t aPos) { 163 if (mIterator) { 164 if (mPos >= aPos) { 165 return Next(); 166 } 167 168 while (mPos < aPos) { 169 const int32_t nextPos = 170 capi::icu4x_WordBreakIteratorUtf16_next_mv1(mIterator); 171 if (nextPos < 0) { 172 return Nothing(); 173 } 174 mPos = static_cast<uint32_t>(nextPos); 175 } 176 177 if (aPos < mPos) { 178 return Some(mPos); 179 } 180 181 return Next(); 182 } 183 return SegmentIteratorUtf16::Seek(aPos); 184 } 185 186 capi::GraphemeClusterSegmenter* GraphemeClusterBreakIteratorUtf16::sSegmenter = 187 nullptr; 188 189 GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16( 190 Span<const char16_t> aText) 191 : SegmentIteratorUtf16(aText) { 192 if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { 193 return; 194 } 195 static std::once_flag sOnce; 196 197 std::call_once(sOnce, [] { 198 auto result = capi::icu4x_GraphemeClusterSegmenter_create_mv1(); 199 sSegmenter = result; 200 201 NS_DispatchToMainThread( 202 NS_NewRunnableFunction("GraphemeClusterBreakIteratorUtf16", [] { 203 RunOnShutdown([] { 204 capi::icu4x_GraphemeClusterSegmenter_destroy_mv1(sSegmenter); 205 sSegmenter = nullptr; 206 }); 207 })); 208 }); 209 210 MOZ_RELEASE_ASSERT(sSegmenter); 211 mIterator = capi::icu4x_GraphemeClusterSegmenter_segment_utf16_mv1( 212 sSegmenter, {mText.Elements(), mText.Length()}); 213 } 214 215 GraphemeClusterBreakIteratorUtf16::~GraphemeClusterBreakIteratorUtf16() { 216 if (mIterator) { 217 capi::icu4x_GraphemeClusterBreakIteratorUtf16_destroy_mv1(mIterator); 218 } 219 } 220 221 enum HSType { 222 HST_NONE = U_HST_NOT_APPLICABLE, 223 HST_L = U_HST_LEADING_JAMO, 224 HST_V = U_HST_VOWEL_JAMO, 225 HST_T = U_HST_TRAILING_JAMO, 226 HST_LV = U_HST_LV_SYLLABLE, 227 HST_LVT = U_HST_LVT_SYLLABLE 228 }; 229 230 static HSType GetHangulSyllableType(uint32_t aCh) { 231 return HSType(UnicodeProperties::GetIntPropertyValue( 232 aCh, UnicodeProperties::IntProperty::HangulSyllableType)); 233 } 234 235 Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() { 236 const auto len = mText.Length(); 237 if (mIterator) { 238 const int32_t nextPos = 239 capi::icu4x_GraphemeClusterBreakIteratorUtf16_next_mv1(mIterator); 240 if (nextPos < 0) { 241 return Nothing(); 242 } 243 if (!nextPos) { 244 return Next(); 245 } 246 mPos = nextPos; 247 return Some(mPos); 248 } 249 if (mPos >= len) { 250 // The iterator has already reached the end. 251 return Nothing(); 252 } 253 254 uint32_t ch = mText[mPos++]; 255 256 if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) { 257 ch = SURROGATE_TO_UCS4(ch, mText[mPos++]); 258 } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) || 259 (ch >= 0xac00 && ch <= 0xd7ff)) { 260 // Handle conjoining Jamo that make Hangul syllables 261 HSType hangulState = GetHangulSyllableType(ch); 262 while (mPos < len) { 263 ch = mText[mPos]; 264 HSType hangulType = GetHangulSyllableType(ch); 265 switch (hangulType) { 266 case HST_L: 267 case HST_LV: 268 case HST_LVT: 269 if (hangulState == HST_L) { 270 hangulState = hangulType; 271 mPos++; 272 continue; 273 } 274 break; 275 case HST_V: 276 if ((hangulState != HST_NONE) && (hangulState != HST_T) && 277 (hangulState != HST_LVT)) { 278 hangulState = hangulType; 279 mPos++; 280 continue; 281 } 282 break; 283 case HST_T: 284 if (hangulState != HST_NONE && hangulState != HST_L) { 285 hangulState = hangulType; 286 mPos++; 287 continue; 288 } 289 break; 290 default: 291 break; 292 } 293 break; 294 } 295 } 296 297 const uint32_t kVS16 = 0xfe0f; 298 const uint32_t kZWJ = 0x200d; 299 // UTF-16 surrogate values for Fitzpatrick type modifiers 300 const uint32_t kFitzpatrickHigh = 0xD83C; 301 const uint32_t kFitzpatrickLowFirst = 0xDFFB; 302 const uint32_t kFitzpatrickLowLast = 0xDFFF; 303 304 // Checking the emoji-presentation property of the base character is a bit 305 // expensive, so we do it lazily. 306 enum class EmojiStatus : uint8_t { 307 No, 308 Yes, 309 Unknown, 310 } baseIsEmojiStatus = EmojiStatus::Unknown; 311 312 // Remember the base character and the position of the next, in case we need 313 // to evaluate its emoji status. 314 uint32_t baseCh = ch; 315 uint32_t afterBase = mPos; 316 317 auto isFitzpatrickModifierAt = [&](uint32_t aPos) -> bool { 318 return aPos + 1 < len && mText[aPos] == kFitzpatrickHigh && 319 mText[aPos + 1] >= kFitzpatrickLowFirst && 320 mText[aPos + 1] <= kFitzpatrickLowLast; 321 }; 322 323 auto baseIsEmoji = [&]() -> bool { 324 if (baseIsEmojiStatus == EmojiStatus::Unknown) { 325 auto basePresentation = GetEmojiPresentation(baseCh); 326 baseIsEmojiStatus = 327 basePresentation == EmojiDefault || 328 (basePresentation == TextDefault && 329 ((afterBase < len && mText[afterBase] == kVS16) || 330 isFitzpatrickModifierAt(afterBase))) 331 ? EmojiStatus::Yes 332 : EmojiStatus::No; 333 } 334 return baseIsEmojiStatus == EmojiStatus::Yes; 335 }; 336 337 bool prevWasZwj = false; 338 339 while (mPos < len) { 340 ch = mText[mPos]; 341 size_t chLen = 1; 342 343 // Check for surrogate pairs; note that isolated surrogates will just 344 // be treated as generic (non-cluster-extending) characters here, 345 // which is fine for cluster-iterating purposes 346 if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) { 347 ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]); 348 chLen = 2; 349 } 350 351 bool extendCluster = 352 IsClusterExtender(ch) || 353 (prevWasZwj && baseIsEmoji() && 354 ((GetEmojiPresentation(ch) == EmojiDefault) || 355 (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len && 356 mText[mPos + chLen] == kVS16))); 357 if (!extendCluster) { 358 break; 359 } 360 361 prevWasZwj = (ch == kZWJ); 362 mPos += chLen; 363 } 364 365 MOZ_ASSERT(mPos <= len, "Next() has overshot the string!"); 366 return Some(mPos); 367 } 368 369 Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Seek(uint32_t aPos) { 370 if (mIterator) { 371 if (mPos >= aPos) { 372 return Next(); 373 } 374 375 while (mPos < aPos) { 376 const int32_t nextPos = 377 capi::icu4x_GraphemeClusterBreakIteratorUtf16_next_mv1(mIterator); 378 if (nextPos < 0) { 379 return Nothing(); 380 } 381 mPos = static_cast<uint32_t>(nextPos); 382 } 383 384 if (aPos < mPos) { 385 return Some(mPos); 386 } 387 388 return Next(); 389 } 390 return SegmentIteratorUtf16::Seek(aPos); 391 } 392 393 GraphemeClusterBreakReverseIteratorUtf16:: 394 GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText) 395 : SegmentIteratorUtf16(aText) { 396 mPos = mText.Length(); 397 } 398 399 Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Next() { 400 if (mPos == 0) { 401 return Nothing(); 402 } 403 404 uint32_t ch; 405 do { 406 ch = mText[--mPos]; 407 408 if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) { 409 ch = SURROGATE_TO_UCS4(mText[--mPos], ch); 410 } 411 412 if (!IsClusterExtender(ch)) { 413 break; 414 } 415 } while (mPos > 0); 416 417 // XXX May need to handle conjoining Jamo 418 419 return Some(mPos); 420 } 421 422 Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) { 423 if (mPos > aPos) { 424 mPos = aPos; 425 } 426 return Next(); 427 } 428 429 SentenceBreakIteratorUtf16::SentenceBreakIteratorUtf16( 430 Span<const char16_t> aText) 431 : SegmentIteratorUtf16(aText) { 432 mSegmenter = capi::icu4x_SentenceSegmenter_create_mv1(); 433 mIterator = capi::icu4x_SentenceSegmenter_segment_utf16_mv1( 434 mSegmenter, {mText.Elements(), mText.Length()}); 435 } 436 437 SentenceBreakIteratorUtf16::~SentenceBreakIteratorUtf16() { 438 if (mIterator) { 439 capi::icu4x_SentenceBreakIteratorUtf16_destroy_mv1(mIterator); 440 } 441 if (mSegmenter) { 442 capi::icu4x_SentenceSegmenter_destroy_mv1(mSegmenter); 443 } 444 } 445 446 Maybe<uint32_t> SentenceBreakIteratorUtf16::Seek(uint32_t aPos) { 447 if (!mIterator) { 448 return Nothing(); 449 } 450 451 if (mPos >= aPos) { 452 return Next(); 453 } 454 455 while (mPos < aPos) { 456 const int32_t nextPos = 457 capi::icu4x_SentenceBreakIteratorUtf16_next_mv1(mIterator); 458 if (nextPos < 0) { 459 return Nothing(); 460 } 461 mPos = static_cast<uint32_t>(nextPos); 462 } 463 464 if (aPos < mPos) { 465 return Some(mPos); 466 } 467 468 return Next(); 469 } 470 471 Maybe<uint32_t> SentenceBreakIteratorUtf16::Next() { 472 if (!mIterator) { 473 return Nothing(); 474 } 475 476 const int32_t nextPos = 477 capi::icu4x_SentenceBreakIteratorUtf16_next_mv1(mIterator); 478 if (nextPos < 0) { 479 return Nothing(); 480 } 481 if (!nextPos) { 482 return Next(); 483 } 484 mPos = nextPos; 485 return Some(mPos); 486 } 487 488 Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate( 489 Span<const char> aLocale, const SegmenterOptions& aOptions) { 490 return MakeUnique<Segmenter>(aLocale, aOptions); 491 } 492 493 UniquePtr<SegmentIteratorUtf16> Segmenter::Segment( 494 Span<const char16_t> aText) const { 495 switch (mOptions.mGranularity) { 496 case SegmenterGranularity::Grapheme: 497 return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText); 498 case SegmenterGranularity::Sentence: 499 return MakeUnique<SentenceBreakIteratorUtf16>(aText); 500 case SegmenterGranularity::Word: 501 return MakeUnique<WordBreakIteratorUtf16>(aText); 502 case SegmenterGranularity::Line: 503 return MakeUnique<LineBreakIteratorUtf16>(aText); 504 } 505 MOZ_ASSERT_UNREACHABLE("All granularities must be handled!"); 506 return nullptr; 507 } 508 509 } // namespace mozilla::intl