tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Segmenter.cpp (32319B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 /* Intl.Segmenter implementation. */
      8 
      9 #include "builtin/intl/Segmenter.h"
     10 
     11 #include "mozilla/Assertions.h"
     12 #include "mozilla/UniquePtr.h"
     13 
     14 #include "jspubtd.h"
     15 #include "NamespaceImports.h"
     16 
     17 #include "builtin/Array.h"
     18 #include "builtin/intl/CommonFunctions.h"
     19 #include "builtin/intl/LocaleNegotiation.h"
     20 #include "builtin/intl/StringAsciiChars.h"
     21 #include "gc/AllocKind.h"
     22 #include "gc/GCContext.h"
     23 #include "icu4x/GraphemeClusterSegmenter.hpp"
     24 #include "icu4x/Locale.hpp"
     25 #include "icu4x/SentenceSegmenter.hpp"
     26 #include "icu4x/WordSegmenter.hpp"
     27 #include "js/CallArgs.h"
     28 #include "js/PropertyDescriptor.h"
     29 #include "js/PropertySpec.h"
     30 #include "js/RootingAPI.h"
     31 #include "js/StableStringChars.h"
     32 #include "js/TypeDecls.h"
     33 #include "js/Value.h"
     34 #include "util/Unicode.h"
     35 #include "vm/ArrayObject.h"
     36 #include "vm/GlobalObject.h"
     37 #include "vm/JSContext.h"
     38 #include "vm/PlainObject.h"
     39 #include "vm/WellKnownAtom.h"
     40 
     41 #include "vm/JSObject-inl.h"
     42 #include "vm/NativeObject-inl.h"
     43 
     44 using namespace js;
     45 using namespace js::intl;
     46 
     47 const JSClassOps SegmenterObject::classOps_ = {
     48    nullptr,                    // addProperty
     49    nullptr,                    // delProperty
     50    nullptr,                    // enumerate
     51    nullptr,                    // newEnumerate
     52    nullptr,                    // resolve
     53    nullptr,                    // mayResolve
     54    SegmenterObject::finalize,  // finalize
     55    nullptr,                    // call
     56    nullptr,                    // construct
     57    nullptr,                    // trace
     58 };
     59 
     60 const JSClass SegmenterObject::class_ = {
     61    "Intl.Segmenter",
     62    JSCLASS_HAS_RESERVED_SLOTS(SegmenterObject::SLOT_COUNT) |
     63        JSCLASS_HAS_CACHED_PROTO(JSProto_Segmenter) |
     64        JSCLASS_FOREGROUND_FINALIZE,
     65    &SegmenterObject::classOps_,
     66    &SegmenterObject::classSpec_,
     67 };
     68 
     69 const JSClass& SegmenterObject::protoClass_ = PlainObject::class_;
     70 
     71 static bool segmenter_supportedLocalesOf(JSContext* cx, unsigned argc,
     72                                         Value* vp);
     73 
     74 static bool segmenter_toSource(JSContext* cx, unsigned argc, Value* vp) {
     75  CallArgs args = CallArgsFromVp(argc, vp);
     76  args.rval().setString(cx->names().Segmenter);
     77  return true;
     78 }
     79 
     80 static const JSFunctionSpec segmenter_static_methods[] = {
     81    JS_FN("supportedLocalesOf", segmenter_supportedLocalesOf, 1, 0),
     82    JS_FS_END,
     83 };
     84 
     85 static const JSFunctionSpec segmenter_methods[] = {
     86    JS_SELF_HOSTED_FN("resolvedOptions", "Intl_Segmenter_resolvedOptions", 0,
     87                      0),
     88    JS_SELF_HOSTED_FN("segment", "Intl_Segmenter_segment", 1, 0),
     89    JS_FN("toSource", segmenter_toSource, 0, 0),
     90    JS_FS_END,
     91 };
     92 
     93 static const JSPropertySpec segmenter_properties[] = {
     94    JS_STRING_SYM_PS(toStringTag, "Intl.Segmenter", JSPROP_READONLY),
     95    JS_PS_END,
     96 };
     97 
     98 static bool Segmenter(JSContext* cx, unsigned argc, Value* vp);
     99 
    100 const ClassSpec SegmenterObject::classSpec_ = {
    101    GenericCreateConstructor<Segmenter, 0, gc::AllocKind::FUNCTION>,
    102    GenericCreatePrototype<SegmenterObject>,
    103    segmenter_static_methods,
    104    nullptr,
    105    segmenter_methods,
    106    segmenter_properties,
    107    nullptr,
    108    ClassSpec::DontDefineConstructor,
    109 };
    110 
    111 /**
    112 * Intl.Segmenter ([ locales [ , options ]])
    113 */
    114 static bool Segmenter(JSContext* cx, unsigned argc, Value* vp) {
    115  CallArgs args = CallArgsFromVp(argc, vp);
    116 
    117  // Step 1.
    118  if (!ThrowIfNotConstructing(cx, args, "Intl.Segmenter")) {
    119    return false;
    120  }
    121 
    122  // Steps 2-3 (Inlined 9.1.14, OrdinaryCreateFromConstructor).
    123  Rooted<JSObject*> proto(cx);
    124  if (!GetPrototypeFromBuiltinConstructor(cx, args, JSProto_Segmenter,
    125                                          &proto)) {
    126    return false;
    127  }
    128 
    129  Rooted<SegmenterObject*> segmenter(cx);
    130  segmenter = NewObjectWithClassProto<SegmenterObject>(cx, proto);
    131  if (!segmenter) {
    132    return false;
    133  }
    134 
    135  HandleValue locales = args.get(0);
    136  HandleValue options = args.get(1);
    137 
    138  // Steps 4-13.
    139  if (!intl::InitializeObject(cx, segmenter, cx->names().InitializeSegmenter,
    140                              locales, options)) {
    141    return false;
    142  }
    143 
    144  // Step 14.
    145  args.rval().setObject(*segmenter);
    146  return true;
    147 }
    148 
    149 const JSClassOps SegmentsObject::classOps_ = {
    150    nullptr,                   // addProperty
    151    nullptr,                   // delProperty
    152    nullptr,                   // enumerate
    153    nullptr,                   // newEnumerate
    154    nullptr,                   // resolve
    155    nullptr,                   // mayResolve
    156    SegmentsObject::finalize,  // finalize
    157    nullptr,                   // call
    158    nullptr,                   // construct
    159    nullptr,                   // trace
    160 };
    161 
    162 const JSClass SegmentsObject::class_ = {
    163    "Intl.Segments",
    164    JSCLASS_HAS_RESERVED_SLOTS(SegmentsObject::SLOT_COUNT) |
    165        JSCLASS_FOREGROUND_FINALIZE,
    166    &SegmentsObject::classOps_,
    167 };
    168 
    169 static const JSFunctionSpec segments_methods[] = {
    170    JS_SELF_HOSTED_FN("containing", "Intl_Segments_containing", 1, 0),
    171    JS_SELF_HOSTED_SYM_FN(iterator, "Intl_Segments_iterator", 0, 0),
    172    JS_FS_END,
    173 };
    174 
    175 bool GlobalObject::initSegmentsProto(JSContext* cx,
    176                                     Handle<GlobalObject*> global) {
    177  Rooted<JSObject*> proto(
    178      cx, GlobalObject::createBlankPrototype<PlainObject>(cx, global));
    179  if (!proto) {
    180    return false;
    181  }
    182 
    183  if (!JS_DefineFunctions(cx, proto, segments_methods)) {
    184    return false;
    185  }
    186 
    187  global->initBuiltinProto(ProtoKind::SegmentsProto, proto);
    188  return true;
    189 }
    190 
    191 const JSClassOps SegmentIteratorObject::classOps_ = {
    192    nullptr,                          // addProperty
    193    nullptr,                          // delProperty
    194    nullptr,                          // enumerate
    195    nullptr,                          // newEnumerate
    196    nullptr,                          // resolve
    197    nullptr,                          // mayResolve
    198    SegmentIteratorObject::finalize,  // finalize
    199    nullptr,                          // call
    200    nullptr,                          // construct
    201    nullptr,                          // trace
    202 };
    203 
    204 const JSClass SegmentIteratorObject::class_ = {
    205    "Intl.SegmentIterator",
    206    JSCLASS_HAS_RESERVED_SLOTS(SegmentIteratorObject::SLOT_COUNT) |
    207        JSCLASS_FOREGROUND_FINALIZE,
    208    &SegmentIteratorObject::classOps_,
    209 };
    210 
    211 static const JSFunctionSpec segment_iterator_methods[] = {
    212    JS_SELF_HOSTED_FN("next", "Intl_SegmentIterator_next", 0, 0),
    213    JS_FS_END,
    214 };
    215 
    216 static const JSPropertySpec segment_iterator_properties[] = {
    217    JS_STRING_SYM_PS(toStringTag, "Segmenter String Iterator", JSPROP_READONLY),
    218    JS_PS_END,
    219 };
    220 
    221 bool GlobalObject::initSegmentIteratorProto(JSContext* cx,
    222                                            Handle<GlobalObject*> global) {
    223  Rooted<JSObject*> iteratorProto(
    224      cx, GlobalObject::getOrCreateIteratorPrototype(cx, global));
    225  if (!iteratorProto) {
    226    return false;
    227  }
    228 
    229  Rooted<JSObject*> proto(
    230      cx, GlobalObject::createBlankPrototypeInheriting<PlainObject>(
    231              cx, iteratorProto));
    232  if (!proto) {
    233    return false;
    234  }
    235 
    236  if (!JS_DefineFunctions(cx, proto, segment_iterator_methods)) {
    237    return false;
    238  }
    239 
    240  if (!JS_DefineProperties(cx, proto, segment_iterator_properties)) {
    241    return false;
    242  }
    243 
    244  global->initBuiltinProto(ProtoKind::SegmentIteratorProto, proto);
    245  return true;
    246 }
    247 
    248 struct Boundaries {
    249  // Start index of this segmentation boundary.
    250  int32_t startIndex = 0;
    251 
    252  // End index of this segmentation boundary.
    253  int32_t endIndex = 0;
    254 
    255  // |true| if the segment is word-like. (Only used for word segmentation.)
    256  bool isWordLike = false;
    257 };
    258 
    259 /**
    260 * Find the segmentation boundary for the string character whose position is
    261 * |index|. The end position of the last segment boundary is |previousIndex|.
    262 */
    263 template <class T>
    264 static Boundaries FindBoundaryFrom(const T& iter, int32_t previousIndex,
    265                                   int32_t index) {
    266  MOZ_ASSERT(previousIndex <= index,
    267             "previous index must not exceed the search index");
    268 
    269  int32_t previous = previousIndex;
    270  while (true) {
    271    // Find the next possible break index.
    272    int32_t next = iter.next();
    273 
    274    // If |next| is larger than the search index, we've found our segment end
    275    // index.
    276    if (next > index) {
    277      return {previous, next, iter.isWordLike()};
    278    }
    279 
    280    // Otherwise store |next| as the start index of the next segment,
    281    previous = next;
    282  }
    283 }
    284 
    285 // TODO: Consider switching to the ICU4X C++ headers when the C++ headers
    286 // are in better shape: https://github.com/rust-diplomat/diplomat/issues/280
    287 
    288 template <typename Interface>
    289 class SegmenterBreakIteratorType {
    290  typename Interface::BreakIterator* impl_;
    291 
    292 public:
    293  explicit SegmenterBreakIteratorType(void* impl)
    294      : impl_(static_cast<typename Interface::BreakIterator*>(impl)) {
    295    MOZ_ASSERT(impl);
    296  }
    297 
    298  int32_t next() const { return Interface::next(impl_); }
    299 
    300  bool isWordLike() const { return Interface::isWordLike(impl_); }
    301 };
    302 
    303 // Each SegmenterBreakIterator interface contains the following definitions:
    304 //
    305 // - BreakIterator: Type of the ICU4X break iterator.
    306 // - Segmenter: Type of the ICU4X segmenter.
// - Char: Character type, either `JS::Latin1Char` or `char16_t`.
// - StringView: Diplomat view type used to pass the characters to `create`.
    308 // - create: Static method to create a new instance of `BreakIterator`.
    309 // - destroy: Static method to destroy an instance of `BreakIterator`.
    310 // - next: Static method to fetch the next break iteration index.
    311 // - isWordLike: Static method to determine if the current segment is word-like.
    312 //
    313 //
    314 // Each Segmenter interface contains the following definitions:
    315 //
    316 // - Segmenter: Type of the ICU4X segmenter.
    317 // - BreakIteratorLatin1: SegmenterBreakIterator interface to Latin1 strings.
    318 // - BreakIteratorTwoByte: SegmenterBreakIterator interface to TwoByte strings.
    319 // - create: Static method to create a new instance of `Segmenter`.
    320 // - destroy: Static method to destroy an instance of `Segmenter`.
    321 
    322 struct GraphemeClusterSegmenterBreakIteratorLatin1 {
    323  using BreakIterator = icu4x::capi::GraphemeClusterBreakIteratorLatin1;
    324  using Segmenter = icu4x::capi::GraphemeClusterSegmenter;
    325  using Char = JS::Latin1Char;
    326  using StringView = diplomat::capi::DiplomatU8View;
    327 
    328  static constexpr auto& create =
    329      icu4x::capi::icu4x_GraphemeClusterSegmenter_segment_latin1_mv1;
    330  static constexpr auto& destroy =
    331      icu4x::capi::icu4x_GraphemeClusterBreakIteratorLatin1_destroy_mv1;
    332  static constexpr auto& next =
    333      icu4x::capi::icu4x_GraphemeClusterBreakIteratorLatin1_next_mv1;
    334 
    335  static bool isWordLike(const BreakIterator*) { return false; }
    336 };
    337 
    338 struct GraphemeClusterSegmenterBreakIteratorTwoByte {
    339  using BreakIterator = icu4x::capi::GraphemeClusterBreakIteratorUtf16;
    340  using Segmenter = icu4x::capi::GraphemeClusterSegmenter;
    341  using Char = char16_t;
    342  using StringView = diplomat::capi::DiplomatString16View;
    343 
    344  static constexpr auto& create =
    345      icu4x::capi::icu4x_GraphemeClusterSegmenter_segment_utf16_mv1;
    346  static constexpr auto& destroy =
    347      icu4x::capi::icu4x_GraphemeClusterBreakIteratorUtf16_destroy_mv1;
    348  static constexpr auto& next =
    349      icu4x::capi::icu4x_GraphemeClusterBreakIteratorUtf16_next_mv1;
    350 
    351  static bool isWordLike(const BreakIterator*) { return false; }
    352 };
    353 
    354 struct GraphemeClusterSegmenter {
    355  using Segmenter = icu4x::capi::GraphemeClusterSegmenter;
    356  using BreakIteratorLatin1 =
    357      SegmenterBreakIteratorType<GraphemeClusterSegmenterBreakIteratorLatin1>;
    358  using BreakIteratorTwoByte =
    359      SegmenterBreakIteratorType<GraphemeClusterSegmenterBreakIteratorTwoByte>;
    360 
    361  static constexpr auto& create =
    362      icu4x::capi::icu4x_GraphemeClusterSegmenter_create_mv1;
    363  static constexpr auto& destroy =
    364      icu4x::capi::icu4x_GraphemeClusterSegmenter_destroy_mv1;
    365 };
    366 
    367 struct WordSegmenterBreakIteratorLatin1 {
    368  using BreakIterator = icu4x::capi::WordBreakIteratorLatin1;
    369  using Segmenter = icu4x::capi::WordSegmenter;
    370  using Char = JS::Latin1Char;
    371  using StringView = diplomat::capi::DiplomatU8View;
    372 
    373  static constexpr auto& create =
    374      icu4x::capi::icu4x_WordSegmenter_segment_latin1_mv1;
    375  static constexpr auto& destroy =
    376      icu4x::capi::icu4x_WordBreakIteratorLatin1_destroy_mv1;
    377  static constexpr auto& next =
    378      icu4x::capi::icu4x_WordBreakIteratorLatin1_next_mv1;
    379  static constexpr auto& isWordLike =
    380      icu4x::capi::icu4x_WordBreakIteratorLatin1_is_word_like_mv1;
    381 };
    382 
    383 struct WordSegmenterBreakIteratorTwoByte {
    384  using BreakIterator = icu4x::capi::WordBreakIteratorUtf16;
    385  using Segmenter = icu4x::capi::WordSegmenter;
    386  using Char = char16_t;
    387  using StringView = diplomat::capi::DiplomatString16View;
    388 
    389  static constexpr auto& create =
    390      icu4x::capi::icu4x_WordSegmenter_segment_utf16_mv1;
    391  static constexpr auto& destroy =
    392      icu4x::capi::icu4x_WordBreakIteratorUtf16_destroy_mv1;
    393  static constexpr auto& next =
    394      icu4x::capi::icu4x_WordBreakIteratorUtf16_next_mv1;
    395  static constexpr auto& isWordLike =
    396      icu4x::capi::icu4x_WordBreakIteratorUtf16_is_word_like_mv1;
    397 };
    398 
    399 struct WordSegmenter {
    400  using Segmenter = icu4x::capi::WordSegmenter;
    401  using BreakIteratorLatin1 =
    402      SegmenterBreakIteratorType<WordSegmenterBreakIteratorLatin1>;
    403  using BreakIteratorTwoByte =
    404      SegmenterBreakIteratorType<WordSegmenterBreakIteratorTwoByte>;
    405 
    406  static constexpr auto& create =
    407      icu4x::capi::icu4x_WordSegmenter_create_auto_with_content_locale_mv1;
    408  static constexpr auto& destroy = icu4x::capi::icu4x_WordSegmenter_destroy_mv1;
    409 };
    410 
    411 struct SentenceSegmenterBreakIteratorLatin1 {
    412  using BreakIterator = icu4x::capi::SentenceBreakIteratorLatin1;
    413  using Segmenter = icu4x::capi::SentenceSegmenter;
    414  using Char = JS::Latin1Char;
    415  using StringView = diplomat::capi::DiplomatU8View;
    416 
    417  static constexpr auto& create =
    418      icu4x::capi::icu4x_SentenceSegmenter_segment_latin1_mv1;
    419  static constexpr auto& destroy =
    420      icu4x::capi::icu4x_SentenceBreakIteratorLatin1_destroy_mv1;
    421  static constexpr auto& next =
    422      icu4x::capi::icu4x_SentenceBreakIteratorLatin1_next_mv1;
    423 
    424  static bool isWordLike(const BreakIterator*) { return false; }
    425 };
    426 
    427 struct SentenceSegmenterBreakIteratorTwoByte {
    428  using BreakIterator = icu4x::capi::SentenceBreakIteratorUtf16;
    429  using Segmenter = icu4x::capi::SentenceSegmenter;
    430  using Char = char16_t;
    431  using StringView = diplomat::capi::DiplomatString16View;
    432 
    433  static constexpr auto& create =
    434      icu4x::capi::icu4x_SentenceSegmenter_segment_utf16_mv1;
    435  static constexpr auto& destroy =
    436      icu4x::capi::icu4x_SentenceBreakIteratorUtf16_destroy_mv1;
    437  static constexpr auto& next =
    438      icu4x::capi::icu4x_SentenceBreakIteratorUtf16_next_mv1;
    439 
    440  static bool isWordLike(const BreakIterator*) { return false; }
    441 };
    442 
    443 struct SentenceSegmenter {
    444  using Segmenter = icu4x::capi::SentenceSegmenter;
    445  using BreakIteratorLatin1 =
    446      SegmenterBreakIteratorType<SentenceSegmenterBreakIteratorLatin1>;
    447  using BreakIteratorTwoByte =
    448      SegmenterBreakIteratorType<SentenceSegmenterBreakIteratorTwoByte>;
    449 
    450  static constexpr auto& create =
    451      icu4x::capi::icu4x_SentenceSegmenter_create_with_content_locale_mv1;
    452  static constexpr auto& destroy =
    453      icu4x::capi::icu4x_SentenceSegmenter_destroy_mv1;
    454 };
    455 
    456 class ICU4XLocaleDeleter {
    457 public:
    458  void operator()(icu4x::capi::Locale* ptr) {
    459    icu4x::capi::icu4x_Locale_destroy_mv1(ptr);
    460  }
    461 };
    462 
    463 using UniqueICU4XLocale =
    464    mozilla::UniquePtr<icu4x::capi::Locale, ICU4XLocaleDeleter>;
    465 
    466 static UniqueICU4XLocale CreateICU4XLocale(JSContext* cx,
    467                                           Handle<JSString*> str) {
    468  auto* linear = str->ensureLinear(cx);
    469  if (!linear) {
    470    return nullptr;
    471  }
    472 
    473  icu4x::capi::icu4x_Locale_from_string_mv1_result result{};
    474  {
    475    intl::StringAsciiChars chars(linear);
    476    if (!chars.init(cx)) {
    477      return nullptr;
    478    }
    479 
    480    auto span = static_cast<mozilla::Span<const char>>(chars);
    481    result =
    482        icu4x::capi::icu4x_Locale_from_string_mv1({span.data(), span.size()});
    483  }
    484 
    485  if (!result.is_ok) {
    486    intl::ReportInternalError(cx);
    487    return nullptr;
    488  }
    489  return UniqueICU4XLocale{result.ok};
    490 }
    491 
    492 /**
    493 * Create a new, locale-invariant ICU4X segmenter instance.
    494 */
    495 template <typename Interface>
    496 static typename Interface::Segmenter* CreateSegmenter() {
    497  return Interface::create();
    498 }
    499 
    500 /**
    501 * Create a new ICU4X segmenter instance, tailored for |locale|.
    502 */
    503 template <typename Interface>
    504 static typename Interface::Segmenter* CreateSegmenter(
    505    JSContext* cx, Handle<JSString*> locale) {
    506  auto loc = CreateICU4XLocale(cx, locale);
    507  if (!loc) {
    508    return nullptr;
    509  }
    510 
    511  auto result = Interface::create(loc.get());
    512  if (!result.is_ok) {
    513    intl::ReportInternalError(cx);
    514    return nullptr;
    515  }
    516  return result.ok;
    517 }
    518 
    519 static bool EnsureInternalsResolved(JSContext* cx,
    520                                    Handle<SegmenterObject*> segmenter) {
    521  if (segmenter->getLocale()) {
    522    return true;
    523  }
    524 
    525  Rooted<JS::Value> value(cx);
    526 
    527  Rooted<JSObject*> internals(cx, intl::GetInternalsObject(cx, segmenter));
    528  if (!internals) {
    529    return false;
    530  }
    531 
    532  if (!GetProperty(cx, internals, internals, cx->names().locale, &value)) {
    533    return false;
    534  }
    535  Rooted<JSString*> locale(cx, value.toString());
    536 
    537  if (!GetProperty(cx, internals, internals, cx->names().granularity, &value)) {
    538    return false;
    539  }
    540 
    541  SegmenterGranularity granularity;
    542  {
    543    JSLinearString* linear = value.toString()->ensureLinear(cx);
    544    if (!linear) {
    545      return false;
    546    }
    547 
    548    if (StringEqualsLiteral(linear, "grapheme")) {
    549      granularity = SegmenterGranularity::Grapheme;
    550    } else if (StringEqualsLiteral(linear, "word")) {
    551      granularity = SegmenterGranularity::Word;
    552    } else {
    553      MOZ_ASSERT(StringEqualsLiteral(linear, "sentence"));
    554      granularity = SegmenterGranularity::Sentence;
    555    }
    556  }
    557 
    558  switch (granularity) {
    559    case SegmenterGranularity::Grapheme: {
    560      auto* seg = CreateSegmenter<GraphemeClusterSegmenter>();
    561      if (!seg) {
    562        return false;
    563      }
    564      segmenter->setSegmenter(seg);
    565      break;
    566    }
    567    case SegmenterGranularity::Word: {
    568      auto* seg = CreateSegmenter<WordSegmenter>(cx, locale);
    569      if (!seg) {
    570        return false;
    571      }
    572      segmenter->setSegmenter(seg);
    573      break;
    574    }
    575    case SegmenterGranularity::Sentence: {
    576      auto* seg = CreateSegmenter<SentenceSegmenter>(cx, locale);
    577      if (!seg) {
    578        return false;
    579      }
    580      segmenter->setSegmenter(seg);
    581      break;
    582    }
    583  }
    584 
    585  segmenter->setLocale(locale);
    586  segmenter->setGranularity(granularity);
    587 
    588  return true;
    589 }
    590 
    591 /**
    592 * Destroy an ICU4X segmenter instance.
    593 */
    594 template <typename Interface>
    595 static void DestroySegmenter(void* seg) {
    596  auto* segmenter = static_cast<typename Interface::Segmenter*>(seg);
    597  Interface::destroy(segmenter);
    598 }
    599 
    600 void SegmenterObject::finalize(JS::GCContext* gcx, JSObject* obj) {
    601  MOZ_ASSERT(gcx->onMainThread());
    602 
    603  auto& segmenter = obj->as<SegmenterObject>();
    604  if (void* seg = segmenter.getSegmenter()) {
    605    switch (segmenter.getGranularity()) {
    606      case SegmenterGranularity::Grapheme: {
    607        DestroySegmenter<GraphemeClusterSegmenter>(seg);
    608        break;
    609      }
    610      case SegmenterGranularity::Word: {
    611        DestroySegmenter<WordSegmenter>(seg);
    612        break;
    613      }
    614      case SegmenterGranularity::Sentence: {
    615        DestroySegmenter<SentenceSegmenter>(seg);
    616        break;
    617      }
    618    }
    619  }
    620 }
    621 
    622 /**
    623 * Destroy an ICU4X break iterator instance.
    624 */
    625 template <typename Interface>
    626 static void DestroyBreakIterator(void* brk) {
    627  auto* breakIterator = static_cast<typename Interface::BreakIterator*>(brk);
    628  Interface::destroy(breakIterator);
    629 }
    630 
    631 /**
    632 * Destroy the ICU4X break iterator attached to |segments|.
    633 */
    634 template <typename T>
    635 static void DestroyBreakIterator(const T* segments) {
    636  void* brk = segments->getBreakIterator();
    637  MOZ_ASSERT(brk);
    638 
    639  bool isLatin1 = segments->hasLatin1StringChars();
    640 
    641  switch (segments->getGranularity()) {
    642    case SegmenterGranularity::Grapheme: {
    643      if (isLatin1) {
    644        DestroyBreakIterator<GraphemeClusterSegmenterBreakIteratorLatin1>(brk);
    645      } else {
    646        DestroyBreakIterator<GraphemeClusterSegmenterBreakIteratorTwoByte>(brk);
    647      }
    648      break;
    649    }
    650    case SegmenterGranularity::Word: {
    651      if (isLatin1) {
    652        DestroyBreakIterator<WordSegmenterBreakIteratorLatin1>(brk);
    653      } else {
    654        DestroyBreakIterator<WordSegmenterBreakIteratorTwoByte>(brk);
    655      }
    656      break;
    657    }
    658    case SegmenterGranularity::Sentence: {
    659      if (isLatin1) {
    660        DestroyBreakIterator<SentenceSegmenterBreakIteratorLatin1>(brk);
    661      } else {
    662        DestroyBreakIterator<SentenceSegmenterBreakIteratorTwoByte>(brk);
    663      }
    664      break;
    665    }
    666  }
    667 }
    668 
    669 void SegmentsObject::finalize(JS::GCContext* gcx, JSObject* obj) {
    670  MOZ_ASSERT(gcx->onMainThread());
    671 
    672  auto* segments = &obj->as<SegmentsObject>();
    673 
    674  if (auto chars = segments->getStringChars()) {
    675    size_t length = segments->getString()->length();
    676    if (chars.has<JS::Latin1Char>()) {
    677      intl::RemoveICUCellMemory(gcx, segments, length * sizeof(JS::Latin1Char));
    678      js_free(chars.data<JS::Latin1Char>());
    679    } else {
    680      intl::RemoveICUCellMemory(gcx, segments, length * sizeof(char16_t));
    681      js_free(chars.data<char16_t>());
    682    }
    683  }
    684 
    685  if (segments->getBreakIterator()) {
    686    DestroyBreakIterator(segments);
    687  }
    688 }
    689 
    690 void SegmentIteratorObject::finalize(JS::GCContext* gcx, JSObject* obj) {
    691  MOZ_ASSERT(gcx->onMainThread());
    692 
    693  auto* iterator = &obj->as<SegmentIteratorObject>();
    694 
    695  if (auto chars = iterator->getStringChars()) {
    696    size_t length = iterator->getString()->length();
    697    if (chars.has<JS::Latin1Char>()) {
    698      intl::RemoveICUCellMemory(gcx, iterator, length * sizeof(JS::Latin1Char));
    699      js_free(chars.data<JS::Latin1Char>());
    700    } else {
    701      intl::RemoveICUCellMemory(gcx, iterator, length * sizeof(char16_t));
    702      js_free(chars.data<char16_t>());
    703    }
    704  }
    705 
    706  if (iterator->getBreakIterator()) {
    707    DestroyBreakIterator(iterator);
    708  }
    709 }
    710 
    711 template <typename Iterator, typename T>
    712 static Boundaries FindBoundaryFrom(Handle<T*> segments, int32_t index) {
    713  MOZ_ASSERT(0 <= index && uint32_t(index) < segments->getString()->length());
    714 
    715  Iterator iter(segments->getBreakIterator());
    716  return FindBoundaryFrom(iter, segments->getIndex(), index);
    717 }
    718 
    719 template <typename T>
    720 static Boundaries GraphemeBoundaries(Handle<T*> segments, int32_t index) {
    721  if (segments->hasLatin1StringChars()) {
    722    return FindBoundaryFrom<GraphemeClusterSegmenter::BreakIteratorLatin1>(
    723        segments, index);
    724  }
    725  return FindBoundaryFrom<GraphemeClusterSegmenter::BreakIteratorTwoByte>(
    726      segments, index);
    727 }
    728 
    729 template <typename T>
    730 static Boundaries WordBoundaries(Handle<T*> segments, int32_t index) {
    731  if (segments->hasLatin1StringChars()) {
    732    return FindBoundaryFrom<WordSegmenter::BreakIteratorLatin1>(segments,
    733                                                                index);
    734  }
    735  return FindBoundaryFrom<WordSegmenter::BreakIteratorTwoByte>(segments, index);
    736 }
    737 
    738 template <typename T>
    739 static Boundaries SentenceBoundaries(Handle<T*> segments, int32_t index) {
    740  if (segments->hasLatin1StringChars()) {
    741    return FindBoundaryFrom<SentenceSegmenter::BreakIteratorLatin1>(segments,
    742                                                                    index);
    743  }
    744  return FindBoundaryFrom<SentenceSegmenter::BreakIteratorTwoByte>(segments,
    745                                                                   index);
    746 }
    747 
    748 /**
    749 * Ensure the string characters have been copied into |segments| in preparation
    750 * for passing the string characters to ICU4X.
    751 */
    752 template <typename T>
    753 static bool EnsureStringChars(JSContext* cx, Handle<T*> segments) {
    754  if (segments->hasStringChars()) {
    755    return true;
    756  }
    757 
    758  Rooted<JSLinearString*> string(cx, segments->getString()->ensureLinear(cx));
    759  if (!string) {
    760    return false;
    761  }
    762 
    763  size_t length = string->length();
    764 
    765  JS::AutoCheckCannotGC nogc;
    766  if (string->hasLatin1Chars()) {
    767    auto chars = DuplicateString(cx, string->latin1Chars(nogc), length);
    768    if (!chars) {
    769      return false;
    770    }
    771    segments->setStringChars(SegmentsStringChars{chars.release()});
    772 
    773    intl::AddICUCellMemory(segments, length * sizeof(JS::Latin1Char));
    774  } else {
    775    auto chars = DuplicateString(cx, string->twoByteChars(nogc), length);
    776    if (!chars) {
    777      return false;
    778    }
    779    segments->setStringChars(SegmentsStringChars{chars.release()});
    780 
    781    intl::AddICUCellMemory(segments, length * sizeof(char16_t));
    782  }
    783  return true;
    784 }
    785 
    786 /**
    787 * Create a new ICU4X break iterator instance.
    788 */
    789 template <typename Interface, typename T>
    790 static auto* CreateBreakIterator(Handle<T*> segments) {
    791  void* segmenter = segments->getSegmenter()->getSegmenter();
    792  MOZ_ASSERT(segmenter);
    793 
    794  auto chars = segments->getStringChars();
    795  MOZ_ASSERT(chars);
    796 
    797  size_t length = segments->getString()->length();
    798 
    799  auto* seg = static_cast<const typename Interface::Segmenter*>(segmenter);
    800  auto* ch = chars.template data<typename Interface::Char>();
    801  typename Interface::StringView view{ch, length};
    802  return Interface::create(seg, view);
    803 }
    804 
    805 /**
    806 * Ensure |segments| has a break iterator whose current segment index is at most
    807 * |index|.
    808 */
    809 template <typename T>
    810 static bool EnsureBreakIterator(JSContext* cx, Handle<T*> segments,
    811                                int32_t index) {
    812  if (segments->getBreakIterator()) {
    813    // Reuse the break iterator if its current segment index is at most |index|.
    814    if (index >= segments->getIndex()) {
    815      return true;
    816    }
    817 
    818    // Reverse iteration not supported. Destroy the previous break iterator and
    819    // start from fresh.
    820    DestroyBreakIterator(segments.get());
    821 
    822    // Reset internal state.
    823    segments->setBreakIterator(nullptr);
    824    segments->setIndex(0);
    825  }
    826 
    827  // Ensure the string characters can be passed to ICU4X.
    828  if (!EnsureStringChars(cx, segments)) {
    829    return false;
    830  }
    831 
    832  bool isLatin1 = segments->hasLatin1StringChars();
    833 
    834  // Create a new break iterator based on the granularity and character type.
    835  void* brk;
    836  switch (segments->getGranularity()) {
    837    case SegmenterGranularity::Grapheme: {
    838      if (isLatin1) {
    839        brk = CreateBreakIterator<GraphemeClusterSegmenterBreakIteratorLatin1>(
    840            segments);
    841      } else {
    842        brk = CreateBreakIterator<GraphemeClusterSegmenterBreakIteratorTwoByte>(
    843            segments);
    844      }
    845      break;
    846    }
    847    case SegmenterGranularity::Word: {
    848      if (isLatin1) {
    849        brk = CreateBreakIterator<WordSegmenterBreakIteratorLatin1>(segments);
    850      } else {
    851        brk = CreateBreakIterator<WordSegmenterBreakIteratorTwoByte>(segments);
    852      }
    853      break;
    854    }
    855    case SegmenterGranularity::Sentence: {
    856      if (isLatin1) {
    857        brk =
    858            CreateBreakIterator<SentenceSegmenterBreakIteratorLatin1>(segments);
    859      } else {
    860        brk = CreateBreakIterator<SentenceSegmenterBreakIteratorTwoByte>(
    861            segments);
    862      }
    863      break;
    864    }
    865  }
    866 
    867  MOZ_RELEASE_ASSERT(brk);
    868  segments->setBreakIterator(brk);
    869 
    870  MOZ_ASSERT(segments->getIndex() == 0, "index is initially zero");
    871 
    872  return true;
    873 }
    874 
    875 /**
    876 * Create the boundaries result array for self-hosted code.
    877 */
    878 static ArrayObject* CreateBoundaries(JSContext* cx, Boundaries boundaries,
    879                                     SegmenterGranularity granularity) {
    880  auto [startIndex, endIndex, isWordLike] = boundaries;
    881 
    882  auto* result = NewDenseFullyAllocatedArray(cx, 3);
    883  if (!result) {
    884    return nullptr;
    885  }
    886  result->setDenseInitializedLength(3);
    887  result->initDenseElement(0, Int32Value(startIndex));
    888  result->initDenseElement(1, Int32Value(endIndex));
    889  if (granularity == SegmenterGranularity::Word) {
    890    result->initDenseElement(2, BooleanValue(isWordLike));
    891  } else {
    892    result->initDenseElement(2, UndefinedValue());
    893  }
    894  return result;
    895 }
    896 
    897 template <typename T>
    898 static ArrayObject* FindSegmentBoundaries(JSContext* cx, Handle<T*> segments,
    899                                          int32_t index) {
    900  // Ensure break iteration can start at |index|.
    901  if (!EnsureBreakIterator(cx, segments, index)) {
    902    return nullptr;
    903  }
    904 
    905  // Find the actual segment boundaries.
    906  Boundaries boundaries{};
    907  switch (segments->getGranularity()) {
    908    case SegmenterGranularity::Grapheme: {
    909      boundaries = GraphemeBoundaries(segments, index);
    910      break;
    911    }
    912    case SegmenterGranularity::Word: {
    913      boundaries = WordBoundaries(segments, index);
    914      break;
    915    }
    916    case SegmenterGranularity::Sentence: {
    917      boundaries = SentenceBoundaries(segments, index);
    918      break;
    919    }
    920  }
    921 
    922  // Remember the end index of the current boundary segment.
    923  segments->setIndex(boundaries.endIndex);
    924 
    925  return CreateBoundaries(cx, boundaries, segments->getGranularity());
    926 }
    927 
    928 bool js::intl_CreateSegmentsObject(JSContext* cx, unsigned argc, Value* vp) {
    929  CallArgs args = CallArgsFromVp(argc, vp);
    930  MOZ_ASSERT(args.length() == 2);
    931 
    932  Rooted<SegmenterObject*> segmenter(cx,
    933                                     &args[0].toObject().as<SegmenterObject>());
    934  Rooted<JSString*> string(cx, args[1].toString());
    935 
    936  // Ensure the internal properties are resolved.
    937  if (!EnsureInternalsResolved(cx, segmenter)) {
    938    return false;
    939  }
    940 
    941  Rooted<JSObject*> proto(
    942      cx, GlobalObject::getOrCreateSegmentsPrototype(cx, cx->global()));
    943  if (!proto) {
    944    return false;
    945  }
    946 
    947  auto* segments = NewObjectWithGivenProto<SegmentsObject>(cx, proto);
    948  if (!segments) {
    949    return false;
    950  }
    951 
    952  segments->setSegmenter(segmenter);
    953  segments->setGranularity(segmenter->getGranularity());
    954  segments->setString(string);
    955  segments->setIndex(0);
    956 
    957  args.rval().setObject(*segments);
    958  return true;
    959 }
    960 
    961 bool js::intl_CreateSegmentIterator(JSContext* cx, unsigned argc, Value* vp) {
    962  CallArgs args = CallArgsFromVp(argc, vp);
    963  MOZ_ASSERT(args.length() == 1);
    964 
    965  Rooted<SegmentsObject*> segments(cx,
    966                                   &args[0].toObject().as<SegmentsObject>());
    967 
    968  Rooted<JSObject*> proto(
    969      cx, GlobalObject::getOrCreateSegmentIteratorPrototype(cx, cx->global()));
    970  if (!proto) {
    971    return false;
    972  }
    973 
    974  auto* iterator = NewObjectWithGivenProto<SegmentIteratorObject>(cx, proto);
    975  if (!iterator) {
    976    return false;
    977  }
    978 
    979  iterator->setSegmenter(segments->getSegmenter());
    980  iterator->setGranularity(segments->getGranularity());
    981  iterator->setString(segments->getString());
    982  iterator->setIndex(0);
    983 
    984  args.rval().setObject(*iterator);
    985  return true;
    986 }
    987 
    988 bool js::intl_FindSegmentBoundaries(JSContext* cx, unsigned argc, Value* vp) {
    989  CallArgs args = CallArgsFromVp(argc, vp);
    990  MOZ_ASSERT(args.length() == 2);
    991 
    992  Rooted<SegmentsObject*> segments(cx,
    993                                   &args[0].toObject().as<SegmentsObject>());
    994 
    995  int32_t index = args[1].toInt32();
    996  MOZ_ASSERT(index >= 0);
    997  MOZ_ASSERT(uint32_t(index) < segments->getString()->length());
    998 
    999  auto* result = FindSegmentBoundaries(
   1000      cx, static_cast<Handle<SegmentsObject*>>(segments), index);
   1001  if (!result) {
   1002    return false;
   1003  }
   1004 
   1005  args.rval().setObject(*result);
   1006  return true;
   1007 }
   1008 
   1009 bool js::intl_FindNextSegmentBoundaries(JSContext* cx, unsigned argc,
   1010                                        Value* vp) {
   1011  CallArgs args = CallArgsFromVp(argc, vp);
   1012  MOZ_ASSERT(args.length() == 1);
   1013 
   1014  Rooted<SegmentIteratorObject*> iterator(
   1015      cx, &args[0].toObject().as<SegmentIteratorObject>());
   1016 
   1017  int32_t index = iterator->getIndex();
   1018  MOZ_ASSERT(index >= 0);
   1019  MOZ_ASSERT(uint32_t(index) < iterator->getString()->length());
   1020 
   1021  auto* result = FindSegmentBoundaries(
   1022      cx, static_cast<Handle<SegmentIteratorObject*>>(iterator), index);
   1023  if (!result) {
   1024    return false;
   1025  }
   1026 
   1027  args.rval().setObject(*result);
   1028  return true;
   1029 }
   1030 
   1031 /**
   1032 * Intl.Segmenter.supportedLocalesOf ( locales [ , options ] )
   1033 */
   1034 static bool segmenter_supportedLocalesOf(JSContext* cx, unsigned argc,
   1035                                         Value* vp) {
   1036  CallArgs args = CallArgsFromVp(argc, vp);
   1037 
   1038  // Steps 1-3.
   1039  auto* array = SupportedLocalesOf(cx, AvailableLocaleKind::Segmenter,
   1040                                   args.get(0), args.get(1));
   1041  if (!array) {
   1042    return false;
   1043  }
   1044  args.rval().setObject(*array);
   1045  return true;
   1046 }