tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

TestSegmenter.cpp (8553B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
      5 * You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "gtest/gtest.h"
      8 
      9 #include "mozilla/intl/Segmenter.h"
     10 #include "mozilla/Preferences.h"
     11 
     12 namespace mozilla::intl {
     13 
     14 TEST(IntlSegmenter, TestLineBreakIteratorUtf16SeekOld)
     15 {
     16  nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", false);
     17  EXPECT_TRUE(rv == NS_OK);
     18 
     19  const SegmenterOptions options{SegmenterGranularity::Line};
     20  auto result = Segmenter::TryCreate("en", options);
     21  ASSERT_TRUE(result.isOk());
     22  auto lineSegmenter = result.unwrap();
     23 
     24  const char16_t text[] = u"hello world";
     25  UniquePtr<SegmentIteratorUtf16> segIter =
     26      lineSegmenter->Segment(MakeStringSpan(text));
     27 
     28  // Seek to space between "hello" and "world".
     29  ASSERT_EQ(segIter->Seek(5u), Some(11u));
     30 
     31  ASSERT_EQ(segIter->Next(), Nothing());
     32 
     33  // Same as calling Next().
     34  ASSERT_EQ(segIter->Seek(0u), Nothing());
     35 }
     36 
     37 TEST(IntlSegmenter, TestLineBreakIteratorUtf16Seek)
     38 {
     39  nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
     40  EXPECT_TRUE(rv == NS_OK);
     41 
     42  const SegmenterOptions options{SegmenterGranularity::Line};
     43  auto result = Segmenter::TryCreate("en", options);
     44  ASSERT_TRUE(result.isOk());
     45  auto lineSegmenter = result.unwrap();
     46 
     47  const char16_t text[] = u"hello world";
     48  UniquePtr<SegmentIteratorUtf16> segIter =
     49      lineSegmenter->Segment(MakeStringSpan(text));
     50 
     51  // Seek to space between "hello" and "world".
     52  // UAX#14 rule returns before "w".
     53  ASSERT_EQ(segIter->Seek(5u), Some(6u));
     54 
     55  ASSERT_EQ(segIter->Next(), Some(11u));
     56 
     57  ASSERT_EQ(segIter->Next(), Nothing());
     58 
     59  // Same as calling Next().
     60  ASSERT_EQ(segIter->Seek(0u), Nothing());
     61 }
     62 
     63 TEST(IntlSegmenter, TestWordBreakIteratorUtf16Simple)
     64 {
     65  const SegmenterOptions options{SegmenterGranularity::Word};
     66  auto result = Segmenter::TryCreate("en", options);
     67  ASSERT_TRUE(result.isOk());
     68  auto wordSegmenter = result.unwrap();
     69 
     70  const char16_t text[] = u"hello world";
     71  UniquePtr<SegmentIteratorUtf16> segIter =
     72      wordSegmenter->Segment(MakeStringSpan(text));
     73 
     74  ASSERT_EQ(segIter->Next(), Some(5u));
     75  ASSERT_EQ(segIter->Next(), Some(6u));
     76  ASSERT_EQ(segIter->Next(), Some(11u));
     77  ASSERT_EQ(segIter->Next(), Nothing());
     78 }
     79 
     80 TEST(IntlSegmenter, TestWordBreakIteratorUtf16Seek)
     81 {
     82  const SegmenterOptions options{SegmenterGranularity::Word};
     83  auto result = Segmenter::TryCreate("en", options);
     84  ASSERT_TRUE(result.isOk());
     85  auto wordSegmenter = result.unwrap();
     86 
     87  const char16_t text[] = u"hello world";
     88  UniquePtr<SegmentIteratorUtf16> segIter =
     89      wordSegmenter->Segment(MakeStringSpan(text));
     90 
     91  // Seek to the space between "hello" and "world"
     92  ASSERT_EQ(segIter->Seek(5u), Some(6u));
     93 
     94  ASSERT_EQ(segIter->Next(), Some(11u));
     95  ASSERT_EQ(segIter->Next(), Nothing());
     96 
     97  // Same as calling Next().
     98  ASSERT_EQ(segIter->Seek(0u), Nothing());
     99 }
    100 
    101 TEST(IntlSegmenter, TestWordBreakIteratorUtf16ResetAndSeekOld)
    102 {
    103  nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", false);
    104  EXPECT_TRUE(rv == NS_OK);
    105 
    106  const SegmenterOptions options{SegmenterGranularity::Word};
    107  auto result = Segmenter::TryCreate("en", options);
    108  ASSERT_TRUE(result.isOk());
    109  auto wordSegmenter = result.unwrap();
    110 
    111  const char16_t text[] = u"hello world";
    112  UniquePtr<SegmentIteratorUtf16> segIter =
    113      wordSegmenter->Segment(MakeStringSpan(text));
    114 
    115  ASSERT_EQ(segIter->Next(), Some(5u));
    116  static_cast<WordBreakIteratorUtf16*>(segIter.get())
    117      ->Reset(MakeStringSpan(text));
    118  ASSERT_EQ(segIter->Next(), Some(5u));
    119  ASSERT_EQ(segIter->Next(), Some(6u));
    120  ASSERT_EQ(segIter->Next(), Some(11u));
    121 
    122  static_cast<WordBreakIteratorUtf16*>(segIter.get())
    123      ->Reset(MakeStringSpan(text));
    124  // Seek to space between "hello" and "world".
    125  ASSERT_EQ(segIter->Seek(5u), Some(6u));
    126 }
    127 
    128 TEST(IntlSegmenter, TestWordBreakIteratorUtf16ResetAndSeek)
    129 {
    130  nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
    131  EXPECT_TRUE(rv == NS_OK);
    132 
    133  const SegmenterOptions options{SegmenterGranularity::Word};
    134  auto result = Segmenter::TryCreate("en", options);
    135  ASSERT_TRUE(result.isOk());
    136  auto wordSegmenter = result.unwrap();
    137 
    138  const char16_t text[] = u"hello world";
    139  UniquePtr<SegmentIteratorUtf16> segIter =
    140      wordSegmenter->Segment(MakeStringSpan(text));
    141 
    142  ASSERT_EQ(segIter->Next(), Some(5u));
    143  static_cast<WordBreakIteratorUtf16*>(segIter.get())
    144      ->Reset(MakeStringSpan(text));
    145  ASSERT_EQ(segIter->Next(), Some(5u));
    146  ASSERT_EQ(segIter->Next(), Some(6u));
    147  ASSERT_EQ(segIter->Next(), Some(11u));
    148 
    149  static_cast<WordBreakIteratorUtf16*>(segIter.get())
    150      ->Reset(MakeStringSpan(text));
    151  // Seek to space between "hello" and "world".
    152  ASSERT_EQ(segIter->Seek(5u), Some(6u));
    153 }
    154 
    155 TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Simple)
    156 {
    157  SegmenterOptions options{SegmenterGranularity::Grapheme};
    158  auto result = Segmenter::TryCreate("en", options);
    159  ASSERT_TRUE(result.isOk());
    160  auto graphemeClusterSegmenter = result.unwrap();
    161 
    162  const char16_t text[] = u"hello world";
    163  UniquePtr<SegmentIteratorUtf16> segIter =
    164      graphemeClusterSegmenter->Segment(MakeStringSpan(text));
    165 
    166  ASSERT_EQ(segIter->Next(), Some(1u));
    167  ASSERT_EQ(segIter->Next(), Some(2u));
    168  ASSERT_EQ(segIter->Next(), Some(3u));
    169  ASSERT_EQ(segIter->Next(), Some(4u));
    170  ASSERT_EQ(segIter->Next(), Some(5u));
    171  ASSERT_EQ(segIter->Next(), Some(6u));
    172  ASSERT_EQ(segIter->Next(), Some(7u));
    173  ASSERT_EQ(segIter->Next(), Some(8u));
    174  ASSERT_EQ(segIter->Next(), Some(9u));
    175  ASSERT_EQ(segIter->Next(), Some(10u));
    176  ASSERT_EQ(segIter->Next(), Some(11u));
    177  ASSERT_EQ(segIter->Next(), Nothing());
    178 }
    179 
    180 TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Seek)
    181 {
    182  SegmenterOptions options{SegmenterGranularity::Grapheme};
    183  auto result = Segmenter::TryCreate("en", options);
    184  ASSERT_TRUE(result.isOk());
    185  auto graphemeClusterSegmenter = result.unwrap();
    186 
    187  const char16_t text[] = u"hello world";
    188  UniquePtr<SegmentIteratorUtf16> segIter =
    189      graphemeClusterSegmenter->Segment(MakeStringSpan(text));
    190 
    191  // Seek to the space between "hello" and "world"
    192  ASSERT_EQ(segIter->Seek(5u), Some(6u));
    193 
    194  ASSERT_EQ(segIter->Next(), Some(7u));
    195  ASSERT_EQ(segIter->Next(), Some(8u));
    196  ASSERT_EQ(segIter->Next(), Some(9u));
    197  ASSERT_EQ(segIter->Next(), Some(10u));
    198  ASSERT_EQ(segIter->Next(), Some(11u));
    199  ASSERT_EQ(segIter->Next(), Nothing());
    200 
    201  // Same as calling Next().
    202  ASSERT_EQ(segIter->Seek(0u), Nothing());
    203 }
    204 
    205 TEST(IntlSegmenter, TestGraphemeClusterBreakReverseIteratorUtf16)
    206 {
    207  const char16_t text[] = u"hello world";
    208  GraphemeClusterBreakReverseIteratorUtf16 segIter(MakeStringSpan(text));
    209 
    210  // Seek to the space between "hello" and "world"
    211  ASSERT_EQ(segIter.Seek(6u), Some(5u));
    212 
    213  ASSERT_EQ(segIter.Next(), Some(4u));
    214  ASSERT_EQ(segIter.Next(), Some(3u));
    215  ASSERT_EQ(segIter.Next(), Some(2u));
    216  ASSERT_EQ(segIter.Next(), Some(1u));
    217  ASSERT_EQ(segIter.Next(), Some(0u));
    218  ASSERT_EQ(segIter.Next(), Nothing());
    219 
    220  // Same as calling Next().
    221  ASSERT_EQ(segIter.Seek(0u), Nothing());
    222 }
    223 
    224 TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16)
    225 {
    226  nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
    227  EXPECT_TRUE(rv == NS_OK);
    228 
    229  SegmenterOptions options{SegmenterGranularity::Sentence};
    230  auto result = Segmenter::TryCreate("en", options);
    231  ASSERT_TRUE(result.isOk());
    232  auto sentenceSegmenter = result.unwrap();
    233 
    234  const char16_t text[] = u"Hello world. Hello world.";
    235  UniquePtr<SegmentIteratorUtf16> segIter =
    236      sentenceSegmenter->Segment(MakeStringSpan(text));
    237 
    238  ASSERT_EQ(segIter->Next(), Some(13u));
    239  ASSERT_EQ(segIter->Next(), Some(25u));
    240  ASSERT_EQ(segIter->Next(), Nothing());
    241 
    242  // Same as calling Next().
    243  ASSERT_EQ(segIter->Seek(0u), Nothing());
    244 }
    245 
    246 TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16Seek)
    247 {
    248  nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
    249  EXPECT_TRUE(rv == NS_OK);
    250 
    251  SegmenterOptions options{SegmenterGranularity::Sentence};
    252  auto result = Segmenter::TryCreate("en", options);
    253  ASSERT_TRUE(result.isOk());
    254  auto sentenceSegmenter = result.unwrap();
    255 
    256  const char16_t text[] = u"Hello world. Hello world.";
    257  UniquePtr<SegmentIteratorUtf16> segIter =
    258      sentenceSegmenter->Segment(MakeStringSpan(text));
    259 
    260  ASSERT_EQ(segIter->Seek(5u), Some(13u));
    261 }
    262 
    263 }  // namespace mozilla::intl