Segmenter.cpp (32319B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 /* Intl.Segmenter implementation. */ 8 9 #include "builtin/intl/Segmenter.h" 10 11 #include "mozilla/Assertions.h" 12 #include "mozilla/UniquePtr.h" 13 14 #include "jspubtd.h" 15 #include "NamespaceImports.h" 16 17 #include "builtin/Array.h" 18 #include "builtin/intl/CommonFunctions.h" 19 #include "builtin/intl/LocaleNegotiation.h" 20 #include "builtin/intl/StringAsciiChars.h" 21 #include "gc/AllocKind.h" 22 #include "gc/GCContext.h" 23 #include "icu4x/GraphemeClusterSegmenter.hpp" 24 #include "icu4x/Locale.hpp" 25 #include "icu4x/SentenceSegmenter.hpp" 26 #include "icu4x/WordSegmenter.hpp" 27 #include "js/CallArgs.h" 28 #include "js/PropertyDescriptor.h" 29 #include "js/PropertySpec.h" 30 #include "js/RootingAPI.h" 31 #include "js/StableStringChars.h" 32 #include "js/TypeDecls.h" 33 #include "js/Value.h" 34 #include "util/Unicode.h" 35 #include "vm/ArrayObject.h" 36 #include "vm/GlobalObject.h" 37 #include "vm/JSContext.h" 38 #include "vm/PlainObject.h" 39 #include "vm/WellKnownAtom.h" 40 41 #include "vm/JSObject-inl.h" 42 #include "vm/NativeObject-inl.h" 43 44 using namespace js; 45 using namespace js::intl; 46 47 const JSClassOps SegmenterObject::classOps_ = { 48 nullptr, // addProperty 49 nullptr, // delProperty 50 nullptr, // enumerate 51 nullptr, // newEnumerate 52 nullptr, // resolve 53 nullptr, // mayResolve 54 SegmenterObject::finalize, // finalize 55 nullptr, // call 56 nullptr, // construct 57 nullptr, // trace 58 }; 59 60 const JSClass SegmenterObject::class_ = { 61 "Intl.Segmenter", 62 JSCLASS_HAS_RESERVED_SLOTS(SegmenterObject::SLOT_COUNT) | 63 JSCLASS_HAS_CACHED_PROTO(JSProto_Segmenter) | 64 JSCLASS_FOREGROUND_FINALIZE, 65 &SegmenterObject::classOps_, 66 &SegmenterObject::classSpec_, 67 }; 68 69 const JSClass& SegmenterObject::protoClass_ = PlainObject::class_; 70 71 static bool segmenter_supportedLocalesOf(JSContext* cx, unsigned argc, 72 Value* vp); 73 74 static bool segmenter_toSource(JSContext* cx, unsigned argc, Value* vp) { 75 CallArgs args = CallArgsFromVp(argc, vp); 76 args.rval().setString(cx->names().Segmenter); 77 return true; 78 } 79 80 static const JSFunctionSpec segmenter_static_methods[] = { 81 JS_FN("supportedLocalesOf", segmenter_supportedLocalesOf, 1, 0), 82 JS_FS_END, 83 }; 84 85 static const JSFunctionSpec segmenter_methods[] = { 86 JS_SELF_HOSTED_FN("resolvedOptions", "Intl_Segmenter_resolvedOptions", 0, 87 0), 88 JS_SELF_HOSTED_FN("segment", "Intl_Segmenter_segment", 1, 0), 89 JS_FN("toSource", segmenter_toSource, 0, 0), 90 JS_FS_END, 91 }; 92 93 static const JSPropertySpec segmenter_properties[] = { 94 JS_STRING_SYM_PS(toStringTag, "Intl.Segmenter", JSPROP_READONLY), 95 JS_PS_END, 96 }; 97 98 static bool Segmenter(JSContext* cx, unsigned argc, Value* vp); 99 100 const ClassSpec SegmenterObject::classSpec_ = { 101 GenericCreateConstructor<Segmenter, 0, gc::AllocKind::FUNCTION>, 102 GenericCreatePrototype<SegmenterObject>, 103 segmenter_static_methods, 104 nullptr, 105 segmenter_methods, 106 segmenter_properties, 107 nullptr, 108 ClassSpec::DontDefineConstructor, 109 }; 110 111 /** 112 * Intl.Segmenter ([ locales [ , options ]]) 113 */ 114 static bool Segmenter(JSContext* cx, unsigned argc, Value* vp) { 115 CallArgs args = CallArgsFromVp(argc, vp); 116 117 // Step 1. 118 if (!ThrowIfNotConstructing(cx, args, "Intl.Segmenter")) { 119 return false; 120 } 121 122 // Steps 2-3 (Inlined 9.1.14, OrdinaryCreateFromConstructor). 123 Rooted<JSObject*> proto(cx); 124 if (!GetPrototypeFromBuiltinConstructor(cx, args, JSProto_Segmenter, 125 &proto)) { 126 return false; 127 } 128 129 Rooted<SegmenterObject*> segmenter(cx); 130 segmenter = NewObjectWithClassProto<SegmenterObject>(cx, proto); 131 if (!segmenter) { 132 return false; 133 } 134 135 HandleValue locales = args.get(0); 136 HandleValue options = args.get(1); 137 138 // Steps 4-13. 139 if (!intl::InitializeObject(cx, segmenter, cx->names().InitializeSegmenter, 140 locales, options)) { 141 return false; 142 } 143 144 // Step 14. 145 args.rval().setObject(*segmenter); 146 return true; 147 } 148 149 const JSClassOps SegmentsObject::classOps_ = { 150 nullptr, // addProperty 151 nullptr, // delProperty 152 nullptr, // enumerate 153 nullptr, // newEnumerate 154 nullptr, // resolve 155 nullptr, // mayResolve 156 SegmentsObject::finalize, // finalize 157 nullptr, // call 158 nullptr, // construct 159 nullptr, // trace 160 }; 161 162 const JSClass SegmentsObject::class_ = { 163 "Intl.Segments", 164 JSCLASS_HAS_RESERVED_SLOTS(SegmentsObject::SLOT_COUNT) | 165 JSCLASS_FOREGROUND_FINALIZE, 166 &SegmentsObject::classOps_, 167 }; 168 169 static const JSFunctionSpec segments_methods[] = { 170 JS_SELF_HOSTED_FN("containing", "Intl_Segments_containing", 1, 0), 171 JS_SELF_HOSTED_SYM_FN(iterator, "Intl_Segments_iterator", 0, 0), 172 JS_FS_END, 173 }; 174 175 bool GlobalObject::initSegmentsProto(JSContext* cx, 176 Handle<GlobalObject*> global) { 177 Rooted<JSObject*> proto( 178 cx, GlobalObject::createBlankPrototype<PlainObject>(cx, global)); 179 if (!proto) { 180 return false; 181 } 182 183 if (!JS_DefineFunctions(cx, proto, segments_methods)) { 184 return false; 185 } 186 187 global->initBuiltinProto(ProtoKind::SegmentsProto, proto); 188 return true; 189 } 190 191 const JSClassOps SegmentIteratorObject::classOps_ = { 192 nullptr, // addProperty 193 nullptr, // delProperty 194 nullptr, // enumerate 195 nullptr, // newEnumerate 196 nullptr, // resolve 197 nullptr, // mayResolve 198 SegmentIteratorObject::finalize, // finalize 199 nullptr, // call 200 nullptr, // construct 201 nullptr, // trace 202 }; 203 204 const JSClass SegmentIteratorObject::class_ = { 205 "Intl.SegmentIterator", 206 JSCLASS_HAS_RESERVED_SLOTS(SegmentIteratorObject::SLOT_COUNT) | 207 JSCLASS_FOREGROUND_FINALIZE, 208 &SegmentIteratorObject::classOps_, 209 }; 210 211 static const JSFunctionSpec segment_iterator_methods[] = { 212 JS_SELF_HOSTED_FN("next", "Intl_SegmentIterator_next", 0, 0), 213 JS_FS_END, 214 }; 215 216 static const JSPropertySpec segment_iterator_properties[] = { 217 JS_STRING_SYM_PS(toStringTag, "Segmenter String Iterator", JSPROP_READONLY), 218 JS_PS_END, 219 }; 220 221 bool GlobalObject::initSegmentIteratorProto(JSContext* cx, 222 Handle<GlobalObject*> global) { 223 Rooted<JSObject*> iteratorProto( 224 cx, GlobalObject::getOrCreateIteratorPrototype(cx, global)); 225 if (!iteratorProto) { 226 return false; 227 } 228 229 Rooted<JSObject*> proto( 230 cx, GlobalObject::createBlankPrototypeInheriting<PlainObject>( 231 cx, iteratorProto)); 232 if (!proto) { 233 return false; 234 } 235 236 if (!JS_DefineFunctions(cx, proto, segment_iterator_methods)) { 237 return false; 238 } 239 240 if (!JS_DefineProperties(cx, proto, segment_iterator_properties)) { 241 return false; 242 } 243 244 global->initBuiltinProto(ProtoKind::SegmentIteratorProto, proto); 245 return true; 246 } 247 248 struct Boundaries { 249 // Start index of this segmentation boundary. 250 int32_t startIndex = 0; 251 252 // End index of this segmentation boundary. 253 int32_t endIndex = 0; 254 255 // |true| if the segment is word-like. (Only used for word segmentation.) 256 bool isWordLike = false; 257 }; 258 259 /** 260 * Find the segmentation boundary for the string character whose position is 261 * |index|. The end position of the last segment boundary is |previousIndex|. 262 */ 263 template <class T> 264 static Boundaries FindBoundaryFrom(const T& iter, int32_t previousIndex, 265 int32_t index) { 266 MOZ_ASSERT(previousIndex <= index, 267 "previous index must not exceed the search index"); 268 269 int32_t previous = previousIndex; 270 while (true) { 271 // Find the next possible break index. 272 int32_t next = iter.next(); 273 274 // If |next| is larger than the search index, we've found our segment end 275 // index. 276 if (next > index) { 277 return {previous, next, iter.isWordLike()}; 278 } 279 280 // Otherwise store |next| as the start index of the next segment, 281 previous = next; 282 } 283 } 284 285 // TODO: Consider switching to the ICU4X C++ headers when the C++ headers 286 // are in better shape: https://github.com/rust-diplomat/diplomat/issues/280 287 288 template <typename Interface> 289 class SegmenterBreakIteratorType { 290 typename Interface::BreakIterator* impl_; 291 292 public: 293 explicit SegmenterBreakIteratorType(void* impl) 294 : impl_(static_cast<typename Interface::BreakIterator*>(impl)) { 295 MOZ_ASSERT(impl); 296 } 297 298 int32_t next() const { return Interface::next(impl_); } 299 300 bool isWordLike() const { return Interface::isWordLike(impl_); } 301 }; 302 303 // Each SegmenterBreakIterator interface contains the following definitions: 304 // 305 // - BreakIterator: Type of the ICU4X break iterator. 306 // - Segmenter: Type of the ICU4X segmenter. 307 // - Char: Character type, either `JS::Latin1Char` or `char16_t`. 308 // - create: Static method to create a new instance of `BreakIterator`. 309 // - destroy: Static method to destroy an instance of `BreakIterator`. 310 // - next: Static method to fetch the next break iteration index. 311 // - isWordLike: Static method to determine if the current segment is word-like. 312 // 313 // 314 // Each Segmenter interface contains the following definitions: 315 // 316 // - Segmenter: Type of the ICU4X segmenter. 317 // - BreakIteratorLatin1: SegmenterBreakIterator interface to Latin1 strings. 318 // - BreakIteratorTwoByte: SegmenterBreakIterator interface to TwoByte strings. 319 // - create: Static method to create a new instance of `Segmenter`. 320 // - destroy: Static method to destroy an instance of `Segmenter`. 321 322 struct GraphemeClusterSegmenterBreakIteratorLatin1 { 323 using BreakIterator = icu4x::capi::GraphemeClusterBreakIteratorLatin1; 324 using Segmenter = icu4x::capi::GraphemeClusterSegmenter; 325 using Char = JS::Latin1Char; 326 using StringView = diplomat::capi::DiplomatU8View; 327 328 static constexpr auto& create = 329 icu4x::capi::icu4x_GraphemeClusterSegmenter_segment_latin1_mv1; 330 static constexpr auto& destroy = 331 icu4x::capi::icu4x_GraphemeClusterBreakIteratorLatin1_destroy_mv1; 332 static constexpr auto& next = 333 icu4x::capi::icu4x_GraphemeClusterBreakIteratorLatin1_next_mv1; 334 335 static bool isWordLike(const BreakIterator*) { return false; } 336 }; 337 338 struct GraphemeClusterSegmenterBreakIteratorTwoByte { 339 using BreakIterator = icu4x::capi::GraphemeClusterBreakIteratorUtf16; 340 using Segmenter = icu4x::capi::GraphemeClusterSegmenter; 341 using Char = char16_t; 342 using StringView = diplomat::capi::DiplomatString16View; 343 344 static constexpr auto& create = 345 icu4x::capi::icu4x_GraphemeClusterSegmenter_segment_utf16_mv1; 346 static constexpr auto& destroy = 347 icu4x::capi::icu4x_GraphemeClusterBreakIteratorUtf16_destroy_mv1; 348 static constexpr auto& next = 349 icu4x::capi::icu4x_GraphemeClusterBreakIteratorUtf16_next_mv1; 350 351 static bool isWordLike(const BreakIterator*) { return false; } 352 }; 353 354 struct GraphemeClusterSegmenter { 355 using Segmenter = icu4x::capi::GraphemeClusterSegmenter; 356 using BreakIteratorLatin1 = 357 SegmenterBreakIteratorType<GraphemeClusterSegmenterBreakIteratorLatin1>; 358 using BreakIteratorTwoByte = 359 SegmenterBreakIteratorType<GraphemeClusterSegmenterBreakIteratorTwoByte>; 360 361 static constexpr auto& create = 362 icu4x::capi::icu4x_GraphemeClusterSegmenter_create_mv1; 363 static constexpr auto& destroy = 364 icu4x::capi::icu4x_GraphemeClusterSegmenter_destroy_mv1; 365 }; 366 367 struct WordSegmenterBreakIteratorLatin1 { 368 using BreakIterator = icu4x::capi::WordBreakIteratorLatin1; 369 using Segmenter = icu4x::capi::WordSegmenter; 370 using Char = JS::Latin1Char; 371 using StringView = diplomat::capi::DiplomatU8View; 372 373 static constexpr auto& create = 374 icu4x::capi::icu4x_WordSegmenter_segment_latin1_mv1; 375 static constexpr auto& destroy = 376 icu4x::capi::icu4x_WordBreakIteratorLatin1_destroy_mv1; 377 static constexpr auto& next = 378 icu4x::capi::icu4x_WordBreakIteratorLatin1_next_mv1; 379 static constexpr auto& isWordLike = 380 icu4x::capi::icu4x_WordBreakIteratorLatin1_is_word_like_mv1; 381 }; 382 383 struct WordSegmenterBreakIteratorTwoByte { 384 using BreakIterator = icu4x::capi::WordBreakIteratorUtf16; 385 using Segmenter = icu4x::capi::WordSegmenter; 386 using Char = char16_t; 387 using StringView = diplomat::capi::DiplomatString16View; 388 389 static constexpr auto& create = 390 icu4x::capi::icu4x_WordSegmenter_segment_utf16_mv1; 391 static constexpr auto& destroy = 392 icu4x::capi::icu4x_WordBreakIteratorUtf16_destroy_mv1; 393 static constexpr auto& next = 394 icu4x::capi::icu4x_WordBreakIteratorUtf16_next_mv1; 395 static constexpr auto& isWordLike = 396 icu4x::capi::icu4x_WordBreakIteratorUtf16_is_word_like_mv1; 397 }; 398 399 struct WordSegmenter { 400 using Segmenter = icu4x::capi::WordSegmenter; 401 using BreakIteratorLatin1 = 402 SegmenterBreakIteratorType<WordSegmenterBreakIteratorLatin1>; 403 using BreakIteratorTwoByte = 404 SegmenterBreakIteratorType<WordSegmenterBreakIteratorTwoByte>; 405 406 static constexpr auto& create = 407 icu4x::capi::icu4x_WordSegmenter_create_auto_with_content_locale_mv1; 408 static constexpr auto& destroy = icu4x::capi::icu4x_WordSegmenter_destroy_mv1; 409 }; 410 411 struct SentenceSegmenterBreakIteratorLatin1 { 412 using BreakIterator = icu4x::capi::SentenceBreakIteratorLatin1; 413 using Segmenter = icu4x::capi::SentenceSegmenter; 414 using Char = JS::Latin1Char; 415 using StringView = diplomat::capi::DiplomatU8View; 416 417 static constexpr auto& create = 418 icu4x::capi::icu4x_SentenceSegmenter_segment_latin1_mv1; 419 static constexpr auto& destroy = 420 icu4x::capi::icu4x_SentenceBreakIteratorLatin1_destroy_mv1; 421 static constexpr auto& next = 422 icu4x::capi::icu4x_SentenceBreakIteratorLatin1_next_mv1; 423 424 static bool isWordLike(const BreakIterator*) { return false; } 425 }; 426 427 struct SentenceSegmenterBreakIteratorTwoByte { 428 using BreakIterator = icu4x::capi::SentenceBreakIteratorUtf16; 429 using Segmenter = icu4x::capi::SentenceSegmenter; 430 using Char = char16_t; 431 using StringView = diplomat::capi::DiplomatString16View; 432 433 static constexpr auto& create = 434 icu4x::capi::icu4x_SentenceSegmenter_segment_utf16_mv1; 435 static constexpr auto& destroy = 436 icu4x::capi::icu4x_SentenceBreakIteratorUtf16_destroy_mv1; 437 static constexpr auto& next = 438 icu4x::capi::icu4x_SentenceBreakIteratorUtf16_next_mv1; 439 440 static bool isWordLike(const BreakIterator*) { return false; } 441 }; 442 443 struct SentenceSegmenter { 444 using Segmenter = icu4x::capi::SentenceSegmenter; 445 using BreakIteratorLatin1 = 446 SegmenterBreakIteratorType<SentenceSegmenterBreakIteratorLatin1>; 447 using BreakIteratorTwoByte = 448 SegmenterBreakIteratorType<SentenceSegmenterBreakIteratorTwoByte>; 449 450 static constexpr auto& create = 451 icu4x::capi::icu4x_SentenceSegmenter_create_with_content_locale_mv1; 452 static constexpr auto& destroy = 453 icu4x::capi::icu4x_SentenceSegmenter_destroy_mv1; 454 }; 455 456 class ICU4XLocaleDeleter { 457 public: 458 void operator()(icu4x::capi::Locale* ptr) { 459 icu4x::capi::icu4x_Locale_destroy_mv1(ptr); 460 } 461 }; 462 463 using UniqueICU4XLocale = 464 mozilla::UniquePtr<icu4x::capi::Locale, ICU4XLocaleDeleter>; 465 466 static UniqueICU4XLocale CreateICU4XLocale(JSContext* cx, 467 Handle<JSString*> str) { 468 auto* linear = str->ensureLinear(cx); 469 if (!linear) { 470 return nullptr; 471 } 472 473 icu4x::capi::icu4x_Locale_from_string_mv1_result result{}; 474 { 475 intl::StringAsciiChars chars(linear); 476 if (!chars.init(cx)) { 477 return nullptr; 478 } 479 480 auto span = static_cast<mozilla::Span<const char>>(chars); 481 result = 482 icu4x::capi::icu4x_Locale_from_string_mv1({span.data(), span.size()}); 483 } 484 485 if (!result.is_ok) { 486 intl::ReportInternalError(cx); 487 return nullptr; 488 } 489 return UniqueICU4XLocale{result.ok}; 490 } 491 492 /** 493 * Create a new, locale-invariant ICU4X segmenter instance. 494 */ 495 template <typename Interface> 496 static typename Interface::Segmenter* CreateSegmenter() { 497 return Interface::create(); 498 } 499 500 /** 501 * Create a new ICU4X segmenter instance, tailored for |locale|. 502 */ 503 template <typename Interface> 504 static typename Interface::Segmenter* CreateSegmenter( 505 JSContext* cx, Handle<JSString*> locale) { 506 auto loc = CreateICU4XLocale(cx, locale); 507 if (!loc) { 508 return nullptr; 509 } 510 511 auto result = Interface::create(loc.get()); 512 if (!result.is_ok) { 513 intl::ReportInternalError(cx); 514 return nullptr; 515 } 516 return result.ok; 517 } 518 519 static bool EnsureInternalsResolved(JSContext* cx, 520 Handle<SegmenterObject*> segmenter) { 521 if (segmenter->getLocale()) { 522 return true; 523 } 524 525 Rooted<JS::Value> value(cx); 526 527 Rooted<JSObject*> internals(cx, intl::GetInternalsObject(cx, segmenter)); 528 if (!internals) { 529 return false; 530 } 531 532 if (!GetProperty(cx, internals, internals, cx->names().locale, &value)) { 533 return false; 534 } 535 Rooted<JSString*> locale(cx, value.toString()); 536 537 if (!GetProperty(cx, internals, internals, cx->names().granularity, &value)) { 538 return false; 539 } 540 541 SegmenterGranularity granularity; 542 { 543 JSLinearString* linear = value.toString()->ensureLinear(cx); 544 if (!linear) { 545 return false; 546 } 547 548 if (StringEqualsLiteral(linear, "grapheme")) { 549 granularity = SegmenterGranularity::Grapheme; 550 } else if (StringEqualsLiteral(linear, "word")) { 551 granularity = SegmenterGranularity::Word; 552 } else { 553 MOZ_ASSERT(StringEqualsLiteral(linear, "sentence")); 554 granularity = SegmenterGranularity::Sentence; 555 } 556 } 557 558 switch (granularity) { 559 case SegmenterGranularity::Grapheme: { 560 auto* seg = CreateSegmenter<GraphemeClusterSegmenter>(); 561 if (!seg) { 562 return false; 563 } 564 segmenter->setSegmenter(seg); 565 break; 566 } 567 case SegmenterGranularity::Word: { 568 auto* seg = CreateSegmenter<WordSegmenter>(cx, locale); 569 if (!seg) { 570 return false; 571 } 572 segmenter->setSegmenter(seg); 573 break; 574 } 575 case SegmenterGranularity::Sentence: { 576 auto* seg = CreateSegmenter<SentenceSegmenter>(cx, locale); 577 if (!seg) { 578 return false; 579 } 580 segmenter->setSegmenter(seg); 581 break; 582 } 583 } 584 585 segmenter->setLocale(locale); 586 segmenter->setGranularity(granularity); 587 588 return true; 589 } 590 591 /** 592 * Destroy an ICU4X segmenter instance. 593 */ 594 template <typename Interface> 595 static void DestroySegmenter(void* seg) { 596 auto* segmenter = static_cast<typename Interface::Segmenter*>(seg); 597 Interface::destroy(segmenter); 598 } 599 600 void SegmenterObject::finalize(JS::GCContext* gcx, JSObject* obj) { 601 MOZ_ASSERT(gcx->onMainThread()); 602 603 auto& segmenter = obj->as<SegmenterObject>(); 604 if (void* seg = segmenter.getSegmenter()) { 605 switch (segmenter.getGranularity()) { 606 case SegmenterGranularity::Grapheme: { 607 DestroySegmenter<GraphemeClusterSegmenter>(seg); 608 break; 609 } 610 case SegmenterGranularity::Word: { 611 DestroySegmenter<WordSegmenter>(seg); 612 break; 613 } 614 case SegmenterGranularity::Sentence: { 615 DestroySegmenter<SentenceSegmenter>(seg); 616 break; 617 } 618 } 619 } 620 } 621 622 /** 623 * Destroy an ICU4X break iterator instance. 624 */ 625 template <typename Interface> 626 static void DestroyBreakIterator(void* brk) { 627 auto* breakIterator = static_cast<typename Interface::BreakIterator*>(brk); 628 Interface::destroy(breakIterator); 629 } 630 631 /** 632 * Destroy the ICU4X break iterator attached to |segments|. 633 */ 634 template <typename T> 635 static void DestroyBreakIterator(const T* segments) { 636 void* brk = segments->getBreakIterator(); 637 MOZ_ASSERT(brk); 638 639 bool isLatin1 = segments->hasLatin1StringChars(); 640 641 switch (segments->getGranularity()) { 642 case SegmenterGranularity::Grapheme: { 643 if (isLatin1) { 644 DestroyBreakIterator<GraphemeClusterSegmenterBreakIteratorLatin1>(brk); 645 } else { 646 DestroyBreakIterator<GraphemeClusterSegmenterBreakIteratorTwoByte>(brk); 647 } 648 break; 649 } 650 case SegmenterGranularity::Word: { 651 if (isLatin1) { 652 DestroyBreakIterator<WordSegmenterBreakIteratorLatin1>(brk); 653 } else { 654 DestroyBreakIterator<WordSegmenterBreakIteratorTwoByte>(brk); 655 } 656 break; 657 } 658 case SegmenterGranularity::Sentence: { 659 if (isLatin1) { 660 DestroyBreakIterator<SentenceSegmenterBreakIteratorLatin1>(brk); 661 } else { 662 DestroyBreakIterator<SentenceSegmenterBreakIteratorTwoByte>(brk); 663 } 664 break; 665 } 666 } 667 } 668 669 void SegmentsObject::finalize(JS::GCContext* gcx, JSObject* obj) { 670 MOZ_ASSERT(gcx->onMainThread()); 671 672 auto* segments = &obj->as<SegmentsObject>(); 673 674 if (auto chars = segments->getStringChars()) { 675 size_t length = segments->getString()->length(); 676 if (chars.has<JS::Latin1Char>()) { 677 intl::RemoveICUCellMemory(gcx, segments, length * sizeof(JS::Latin1Char)); 678 js_free(chars.data<JS::Latin1Char>()); 679 } else { 680 intl::RemoveICUCellMemory(gcx, segments, length * sizeof(char16_t)); 681 js_free(chars.data<char16_t>()); 682 } 683 } 684 685 if (segments->getBreakIterator()) { 686 DestroyBreakIterator(segments); 687 } 688 } 689 690 void SegmentIteratorObject::finalize(JS::GCContext* gcx, JSObject* obj) { 691 MOZ_ASSERT(gcx->onMainThread()); 692 693 auto* iterator = &obj->as<SegmentIteratorObject>(); 694 695 if (auto chars = iterator->getStringChars()) { 696 size_t length = iterator->getString()->length(); 697 if (chars.has<JS::Latin1Char>()) { 698 intl::RemoveICUCellMemory(gcx, iterator, length * sizeof(JS::Latin1Char)); 699 js_free(chars.data<JS::Latin1Char>()); 700 } else { 701 intl::RemoveICUCellMemory(gcx, iterator, length * sizeof(char16_t)); 702 js_free(chars.data<char16_t>()); 703 } 704 } 705 706 if (iterator->getBreakIterator()) { 707 DestroyBreakIterator(iterator); 708 } 709 } 710 711 template <typename Iterator, typename T> 712 static Boundaries FindBoundaryFrom(Handle<T*> segments, int32_t index) { 713 MOZ_ASSERT(0 <= index && uint32_t(index) < segments->getString()->length()); 714 715 Iterator iter(segments->getBreakIterator()); 716 return FindBoundaryFrom(iter, segments->getIndex(), index); 717 } 718 719 template <typename T> 720 static Boundaries GraphemeBoundaries(Handle<T*> segments, int32_t index) { 721 if (segments->hasLatin1StringChars()) { 722 return FindBoundaryFrom<GraphemeClusterSegmenter::BreakIteratorLatin1>( 723 segments, index); 724 } 725 return FindBoundaryFrom<GraphemeClusterSegmenter::BreakIteratorTwoByte>( 726 segments, index); 727 } 728 729 template <typename T> 730 static Boundaries WordBoundaries(Handle<T*> segments, int32_t index) { 731 if (segments->hasLatin1StringChars()) { 732 return FindBoundaryFrom<WordSegmenter::BreakIteratorLatin1>(segments, 733 index); 734 } 735 return FindBoundaryFrom<WordSegmenter::BreakIteratorTwoByte>(segments, index); 736 } 737 738 template <typename T> 739 static Boundaries SentenceBoundaries(Handle<T*> segments, int32_t index) { 740 if (segments->hasLatin1StringChars()) { 741 return FindBoundaryFrom<SentenceSegmenter::BreakIteratorLatin1>(segments, 742 index); 743 } 744 return FindBoundaryFrom<SentenceSegmenter::BreakIteratorTwoByte>(segments, 745 index); 746 } 747 748 /** 749 * Ensure the string characters have been copied into |segments| in preparation 750 * for passing the string characters to ICU4X. 751 */ 752 template <typename T> 753 static bool EnsureStringChars(JSContext* cx, Handle<T*> segments) { 754 if (segments->hasStringChars()) { 755 return true; 756 } 757 758 Rooted<JSLinearString*> string(cx, segments->getString()->ensureLinear(cx)); 759 if (!string) { 760 return false; 761 } 762 763 size_t length = string->length(); 764 765 JS::AutoCheckCannotGC nogc; 766 if (string->hasLatin1Chars()) { 767 auto chars = DuplicateString(cx, string->latin1Chars(nogc), length); 768 if (!chars) { 769 return false; 770 } 771 segments->setStringChars(SegmentsStringChars{chars.release()}); 772 773 intl::AddICUCellMemory(segments, length * sizeof(JS::Latin1Char)); 774 } else { 775 auto chars = DuplicateString(cx, string->twoByteChars(nogc), length); 776 if (!chars) { 777 return false; 778 } 779 segments->setStringChars(SegmentsStringChars{chars.release()}); 780 781 intl::AddICUCellMemory(segments, length * sizeof(char16_t)); 782 } 783 return true; 784 } 785 786 /** 787 * Create a new ICU4X break iterator instance. 788 */ 789 template <typename Interface, typename T> 790 static auto* CreateBreakIterator(Handle<T*> segments) { 791 void* segmenter = segments->getSegmenter()->getSegmenter(); 792 MOZ_ASSERT(segmenter); 793 794 auto chars = segments->getStringChars(); 795 MOZ_ASSERT(chars); 796 797 size_t length = segments->getString()->length(); 798 799 auto* seg = static_cast<const typename Interface::Segmenter*>(segmenter); 800 auto* ch = chars.template data<typename Interface::Char>(); 801 typename Interface::StringView view{ch, length}; 802 return Interface::create(seg, view); 803 } 804 805 /** 806 * Ensure |segments| has a break iterator whose current segment index is at most 807 * |index|. 808 */ 809 template <typename T> 810 static bool EnsureBreakIterator(JSContext* cx, Handle<T*> segments, 811 int32_t index) { 812 if (segments->getBreakIterator()) { 813 // Reuse the break iterator if its current segment index is at most |index|. 814 if (index >= segments->getIndex()) { 815 return true; 816 } 817 818 // Reverse iteration not supported. Destroy the previous break iterator and 819 // start from fresh. 820 DestroyBreakIterator(segments.get()); 821 822 // Reset internal state. 823 segments->setBreakIterator(nullptr); 824 segments->setIndex(0); 825 } 826 827 // Ensure the string characters can be passed to ICU4X. 828 if (!EnsureStringChars(cx, segments)) { 829 return false; 830 } 831 832 bool isLatin1 = segments->hasLatin1StringChars(); 833 834 // Create a new break iterator based on the granularity and character type. 835 void* brk; 836 switch (segments->getGranularity()) { 837 case SegmenterGranularity::Grapheme: { 838 if (isLatin1) { 839 brk = CreateBreakIterator<GraphemeClusterSegmenterBreakIteratorLatin1>( 840 segments); 841 } else { 842 brk = CreateBreakIterator<GraphemeClusterSegmenterBreakIteratorTwoByte>( 843 segments); 844 } 845 break; 846 } 847 case SegmenterGranularity::Word: { 848 if (isLatin1) { 849 brk = CreateBreakIterator<WordSegmenterBreakIteratorLatin1>(segments); 850 } else { 851 brk = CreateBreakIterator<WordSegmenterBreakIteratorTwoByte>(segments); 852 } 853 break; 854 } 855 case SegmenterGranularity::Sentence: { 856 if (isLatin1) { 857 brk = 858 CreateBreakIterator<SentenceSegmenterBreakIteratorLatin1>(segments); 859 } else { 860 brk = CreateBreakIterator<SentenceSegmenterBreakIteratorTwoByte>( 861 segments); 862 } 863 break; 864 } 865 } 866 867 MOZ_RELEASE_ASSERT(brk); 868 segments->setBreakIterator(brk); 869 870 MOZ_ASSERT(segments->getIndex() == 0, "index is initially zero"); 871 872 return true; 873 } 874 875 /** 876 * Create the boundaries result array for self-hosted code. 877 */ 878 static ArrayObject* CreateBoundaries(JSContext* cx, Boundaries boundaries, 879 SegmenterGranularity granularity) { 880 auto [startIndex, endIndex, isWordLike] = boundaries; 881 882 auto* result = NewDenseFullyAllocatedArray(cx, 3); 883 if (!result) { 884 return nullptr; 885 } 886 result->setDenseInitializedLength(3); 887 result->initDenseElement(0, Int32Value(startIndex)); 888 result->initDenseElement(1, Int32Value(endIndex)); 889 if (granularity == SegmenterGranularity::Word) { 890 result->initDenseElement(2, BooleanValue(isWordLike)); 891 } else { 892 result->initDenseElement(2, UndefinedValue()); 893 } 894 return result; 895 } 896 897 template <typename T> 898 static ArrayObject* FindSegmentBoundaries(JSContext* cx, Handle<T*> segments, 899 int32_t index) { 900 // Ensure break iteration can start at |index|. 901 if (!EnsureBreakIterator(cx, segments, index)) { 902 return nullptr; 903 } 904 905 // Find the actual segment boundaries. 906 Boundaries boundaries{}; 907 switch (segments->getGranularity()) { 908 case SegmenterGranularity::Grapheme: { 909 boundaries = GraphemeBoundaries(segments, index); 910 break; 911 } 912 case SegmenterGranularity::Word: { 913 boundaries = WordBoundaries(segments, index); 914 break; 915 } 916 case SegmenterGranularity::Sentence: { 917 boundaries = SentenceBoundaries(segments, index); 918 break; 919 } 920 } 921 922 // Remember the end index of the current boundary segment. 923 segments->setIndex(boundaries.endIndex); 924 925 return CreateBoundaries(cx, boundaries, segments->getGranularity()); 926 } 927 928 bool js::intl_CreateSegmentsObject(JSContext* cx, unsigned argc, Value* vp) { 929 CallArgs args = CallArgsFromVp(argc, vp); 930 MOZ_ASSERT(args.length() == 2); 931 932 Rooted<SegmenterObject*> segmenter(cx, 933 &args[0].toObject().as<SegmenterObject>()); 934 Rooted<JSString*> string(cx, args[1].toString()); 935 936 // Ensure the internal properties are resolved. 937 if (!EnsureInternalsResolved(cx, segmenter)) { 938 return false; 939 } 940 941 Rooted<JSObject*> proto( 942 cx, GlobalObject::getOrCreateSegmentsPrototype(cx, cx->global())); 943 if (!proto) { 944 return false; 945 } 946 947 auto* segments = NewObjectWithGivenProto<SegmentsObject>(cx, proto); 948 if (!segments) { 949 return false; 950 } 951 952 segments->setSegmenter(segmenter); 953 segments->setGranularity(segmenter->getGranularity()); 954 segments->setString(string); 955 segments->setIndex(0); 956 957 args.rval().setObject(*segments); 958 return true; 959 } 960 961 bool js::intl_CreateSegmentIterator(JSContext* cx, unsigned argc, Value* vp) { 962 CallArgs args = CallArgsFromVp(argc, vp); 963 MOZ_ASSERT(args.length() == 1); 964 965 Rooted<SegmentsObject*> segments(cx, 966 &args[0].toObject().as<SegmentsObject>()); 967 968 Rooted<JSObject*> proto( 969 cx, GlobalObject::getOrCreateSegmentIteratorPrototype(cx, cx->global())); 970 if (!proto) { 971 return false; 972 } 973 974 auto* iterator = NewObjectWithGivenProto<SegmentIteratorObject>(cx, proto); 975 if (!iterator) { 976 return false; 977 } 978 979 iterator->setSegmenter(segments->getSegmenter()); 980 iterator->setGranularity(segments->getGranularity()); 981 iterator->setString(segments->getString()); 982 iterator->setIndex(0); 983 984 args.rval().setObject(*iterator); 985 return true; 986 } 987 988 bool js::intl_FindSegmentBoundaries(JSContext* cx, unsigned argc, Value* vp) { 989 CallArgs args = CallArgsFromVp(argc, vp); 990 MOZ_ASSERT(args.length() == 2); 991 992 Rooted<SegmentsObject*> segments(cx, 993 &args[0].toObject().as<SegmentsObject>()); 994 995 int32_t index = args[1].toInt32(); 996 MOZ_ASSERT(index >= 0); 997 MOZ_ASSERT(uint32_t(index) < segments->getString()->length()); 998 999 auto* result = FindSegmentBoundaries( 1000 cx, static_cast<Handle<SegmentsObject*>>(segments), index); 1001 if (!result) { 1002 return false; 1003 } 1004 1005 args.rval().setObject(*result); 1006 return true; 1007 } 1008 1009 bool js::intl_FindNextSegmentBoundaries(JSContext* cx, unsigned argc, 1010 Value* vp) { 1011 CallArgs args = CallArgsFromVp(argc, vp); 1012 MOZ_ASSERT(args.length() == 1); 1013 1014 Rooted<SegmentIteratorObject*> iterator( 1015 cx, &args[0].toObject().as<SegmentIteratorObject>()); 1016 1017 int32_t index = iterator->getIndex(); 1018 MOZ_ASSERT(index >= 0); 1019 MOZ_ASSERT(uint32_t(index) < iterator->getString()->length()); 1020 1021 auto* result = FindSegmentBoundaries( 1022 cx, static_cast<Handle<SegmentIteratorObject*>>(iterator), index); 1023 if (!result) { 1024 return false; 1025 } 1026 1027 args.rval().setObject(*result); 1028 return true; 1029 } 1030 1031 /** 1032 * Intl.Segmenter.supportedLocalesOf ( locales [ , options ] ) 1033 */ 1034 static bool segmenter_supportedLocalesOf(JSContext* cx, unsigned argc, 1035 Value* vp) { 1036 CallArgs args = CallArgsFromVp(argc, vp); 1037 1038 // Steps 1-3. 1039 auto* array = SupportedLocalesOf(cx, AvailableLocaleKind::Segmenter, 1040 args.get(0), args.get(1)); 1041 if (!array) { 1042 return false; 1043 } 1044 args.rval().setObject(*array); 1045 return true; 1046 }