tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

TestBreak.cpp (12510B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include <stdio.h>
      8 
      9 #include "gtest/gtest.h"
     10 #include "mozilla/intl/LineBreaker.h"
     11 #include "mozilla/intl/WordBreaker.h"
     12 #include "mozilla/Preferences.h"
     13 #include "mozilla/Span.h"
     14 #include "nsISupports.h"
     15 #include "nsServiceManagerUtils.h"
     16 #include "nsString.h"
     17 #include "nsTArray.h"
     18 #include "nsXPCOM.h"
     19 
     20 using mozilla::intl::LineBreaker;
     21 using mozilla::intl::WordBreaker;
     22 
     23 // Turn off clang-format to align the ruler comments to the test strings.
     24 
     25 // clang-format off
     26 static char teng0[] =
     27  //           1         2         3         4         5         6         7
     28  // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
     29    "hello world";
     30 // clang-format on
     31 
     32 static uint32_t lexp0[] = {5, 11};
     33 
     34 static uint32_t wexp0[] = {5, 6, 11};
     35 
     36 // clang-format off
     37 static char teng1[] =
     38  //           1         2         3         4         5         6         7
     39  // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
     40    "This is a test to test(reasonable) line    break. This 0.01123 = 45 x 48.";
     41 // clang-format on
     42 
     43 static uint32_t lexp1[] = {4,  7,  9,  14, 17, 34, 39, 40, 41,
     44                           42, 49, 54, 62, 64, 67, 69, 73};
     45 
     46 static uint32_t wexp1[] = {4,  5,  7,  8,  9,  10, 14, 15, 17, 18, 22, 23,
     47                           33, 34, 35, 39, 43, 48, 49, 50, 54, 55, 56, 57,
     48                           62, 63, 64, 65, 67, 68, 69, 70, 72, 73};
     49 
     50 // clang-format off
     51 static char teng2[] =
     52  //           1         2         3         4         5         6         7
     53  // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
     54    "()((reasonab(l)e) line  break. .01123=45x48.";
     55 // clang-format on
     56 
     57 static uint32_t lexp2[] = {17, 22, 23, 30, 44};
     58 
     59 static uint32_t wexp2[] = {4,  12, 13, 14, 15, 16, 17, 18, 22,
     60                           24, 29, 30, 31, 32, 37, 38, 43, 44};
     61 
     62 // clang-format off
     63 static char teng3[] =
     64  //           1         2         3         4         5         6         7
     65  // 01234567890123456789012345678901234567890123456789012345678901234567890123456789
     66    "It's a test to test(ronae ) line break....";
     67 // clang-format on
     68 
     69 static uint32_t lexp3[] = {4, 6, 11, 14, 25, 27, 32, 42};
     70 
     71 static uint32_t wexp3[] = {2,  3,  4,  5,  6,  7,  11, 12, 14, 15,
     72                           19, 20, 25, 26, 27, 28, 32, 33, 38, 42};
     73 
     74 static char ruler1[] =
     75    "          1         2         3         4         5         6         7  ";
     76 static char ruler2[] =
     77    "0123456789012345678901234567890123456789012345678901234567890123456789012";
     78 
     79 bool Check(const char* in, mozilla::Span<const uint32_t> out,
     80           mozilla::Span<const uint32_t> res) {
     81  const uint32_t outlen = out.Length();
     82  const uint32_t i = res.Length();
     83  bool ok = true;
     84 
     85  if (i != outlen) {
     86    ok = false;
     87    printf("WARNING!!! return size wrong, expect %d but got %d \n", outlen, i);
     88  }
     89 
     90  for (uint32_t j = 0; j < i; j++) {
     91    if (j < outlen) {
     92      if (res[j] != out[j]) {
     93        ok = false;
     94        printf("[%d] expect %d but got %d\n", j, out[j], res[j]);
     95      }
     96    } else {
     97      ok = false;
     98      printf("[%d] additional %d\n", j, res[j]);
     99    }
    100  }
    101 
    102  if (!ok) {
    103    printf("string  = \n%s\n", in);
    104    printf("%s\n", ruler1);
    105    printf("%s\n", ruler2);
    106 
    107    printf("Expect = \n");
    108    for (uint32_t j = 0; j < outlen; j++) {
    109      printf("%d,", out[j]);
    110    }
    111 
    112    printf("\nResult = \n");
    113    for (uint32_t j = 0; j < i; j++) {
    114      printf("%d,", res[j]);
    115    }
    116    printf("\n");
    117  }
    118 
    119  return ok;
    120 }
    121 
    122 bool TestASCIILB(const char* in, mozilla::Span<const uint32_t> out) {
    123  NS_ConvertASCIItoUTF16 input(in);
    124  EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!";
    125 
    126  nsTArray<uint32_t> result;
    127  int32_t curr = 0;
    128  while (true) {
    129    curr = LineBreaker::Next(input.get(), input.Length(), curr);
    130    if (curr == NS_LINEBREAKER_NEED_MORE_TEXT) {
    131      break;
    132    }
    133    result.AppendElement(curr);
    134  }
    135 
    136  return Check(in, out, result);
    137 }
    138 
    139 bool TestASCIIWB(const char* in, mozilla::Span<const uint32_t> out) {
    140  NS_ConvertASCIItoUTF16 input(in);
    141  EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!";
    142 
    143  nsTArray<uint32_t> result;
    144  int32_t curr = 0;
    145  while (true) {
    146    curr = WordBreaker::Next(input.get(), input.Length(), curr);
    147    if (curr == NS_WORDBREAKER_NEED_MORE_TEXT) {
    148      break;
    149    }
    150    result.AppendElement(curr);
    151  }
    152 
    153  return Check(in, out, result);
    154 }
    155 
    156 TEST(LineBreak, LineBreaker)
    157 {
    158  ASSERT_TRUE(TestASCIILB(teng0, lexp0));
    159  ASSERT_TRUE(TestASCIILB(teng1, lexp1));
    160  ASSERT_TRUE(TestASCIILB(teng2, lexp2));
    161  ASSERT_TRUE(TestASCIILB(teng3, lexp3));
    162 }
    163 
    164 TEST(WordBreak, WordBreaker)
    165 {
    166  ASSERT_TRUE(TestASCIIWB(teng0, wexp0));
    167  ASSERT_TRUE(TestASCIIWB(teng1, wexp1));
    168  ASSERT_TRUE(TestASCIIWB(teng2, wexp2));
    169  ASSERT_TRUE(TestASCIIWB(teng3, wexp3));
    170 }
    171 
    172 //                         012345678901234
    173 static const char wb0[] = "T";
    174 static const char wb1[] = "h";
    175 static const char wb2[] = "";
    176 static const char wb3[] = "is   is a int";
    177 static const char wb4[] = "";
    178 static const char wb5[] = "";
    179 static const char wb6[] = "ernationali";
    180 static const char wb7[] = "zation work.";
    181 
    182 static const char* wb[] = {wb0, wb1, wb2, wb3, wb4, wb5, wb6, wb7};
    183 
    184 TEST(WordBreak, TestPrintWordWithBreak)
    185 {
    186  uint32_t numOfFragment = sizeof(wb) / sizeof(char*);
    187 
    188  // This test generate the result string by appending '^' at every word break
    189  // opportunity except the one at end of the text.
    190  nsAutoString result;
    191 
    192  for (uint32_t i = 0; i < numOfFragment; i++) {
    193    NS_ConvertASCIItoUTF16 fragText(wb[i]);
    194 
    195    int32_t cur = 0;
    196    cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur);
    197    uint32_t start = 0;
    198    while (cur != NS_WORDBREAKER_NEED_MORE_TEXT) {
    199      result.Append(Substring(fragText, start, cur - start));
    200 
    201      // Append '^' only if cur is within the fragText. We'll check the word
    202      // break opportunity between fragText and nextFragText using
    203      // BreakInBetween() below.
    204      if (cur < static_cast<int32_t>(fragText.Length())) {
    205        result.Append('^');
    206      }
    207      start = (cur >= 0 ? cur : cur - start);
    208      cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur);
    209    }
    210 
    211    if (i != numOfFragment - 1) {
    212      NS_ConvertASCIItoUTF16 nextFragText(wb[i + 1]);
    213      if (nextFragText.IsEmpty()) {
    214        // If nextFragText is empty, there's no new possible word break
    215        // opportunity.
    216        continue;
    217      }
    218 
    219      const auto origFragLen = static_cast<int32_t>(fragText.Length());
    220      fragText.Append(nextFragText);
    221 
    222      bool canBreak =
    223          origFragLen ==
    224          WordBreaker::Next(fragText.get(), fragText.Length(), origFragLen - 1);
    225      if (canBreak) {
    226        result.Append('^');
    227      }
    228    }
    229  }
    230  ASSERT_STREQ("This^   ^is^ ^a^ ^internationalization^ ^work^.",
    231               NS_ConvertUTF16toUTF8(result).get());
    232 }
    233 
    234 // This function searches a complete word starting from |offset| in wb[fragN].
    235 // If it reaches the end of wb[fragN], and there is no word break opportunity
    236 // between wb[fragN] and wb[fragN+1], it will continue the search in wb[fragN+1]
    237 // until a word break.
    238 void TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset,
    239                                   const char* expected) {
    240  uint32_t numOfFragment = sizeof(wb) / sizeof(char*);
    241 
    242  NS_ConvertASCIItoUTF16 fragText(wb[fragN]);
    243 
    244  mozilla::intl::WordRange res = WordBreaker::FindWord(fragText, offset);
    245 
    246  nsAutoString result(Substring(fragText, res.mBegin, res.mEnd - res.mBegin));
    247 
    248  if ((uint32_t)fragText.Length() <= res.mEnd) {
    249    // if we hit the end of the fragment
    250    nsAutoString curFragText = fragText;
    251    for (uint32_t p = fragN + 1; p < numOfFragment; p++) {
    252      NS_ConvertASCIItoUTF16 nextFragText(wb[p]);
    253      if (nextFragText.IsEmpty()) {
    254        // If nextFragText is empty, there's no new possible word break
    255        // opportunity between curFragText and nextFragText.
    256        continue;
    257      }
    258 
    259      const auto origFragLen = static_cast<int32_t>(curFragText.Length());
    260      curFragText.Append(nextFragText);
    261      bool canBreak = origFragLen == WordBreaker::Next(curFragText.get(),
    262                                                       curFragText.Length(),
    263                                                       origFragLen - 1);
    264      if (canBreak) {
    265        break;
    266      }
    267      mozilla::intl::WordRange r = WordBreaker::FindWord(nextFragText, 0);
    268 
    269      result.Append(Substring(nextFragText, r.mBegin, r.mEnd - r.mBegin));
    270 
    271      if ((uint32_t)nextFragText.Length() != r.mEnd) {
    272        break;
    273      }
    274    }
    275  }
    276 
    277  ASSERT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get())
    278      << "FindWordBreakFromPosition(" << fragN << ", " << offset << ")";
    279 }
    280 
    281 TEST(WordBreak, TestNextWordBreakWithComplexLanguage)
    282 {
    283  nsString fragText(u"\u0e40\u0e1b\u0e47\u0e19\u0e19\u0e31\u0e01");
    284 
    285  int32_t offset = 0;
    286  while (offset != NS_WORDBREAKER_NEED_MORE_TEXT) {
    287    int32_t newOffset =
    288        WordBreaker::Next(fragText.get(), fragText.Length(), offset);
    289    ASSERT_NE(offset, newOffset);
    290    offset = newOffset;
    291  }
    292  ASSERT_TRUE(true);
    293 }
    294 
    295 TEST(WordBreak, TestFindWordWithEmptyString)
    296 {
    297  mozilla::intl::WordRange expect{0, 0};
    298  mozilla::intl::WordRange result = WordBreaker::FindWord(EmptyString(), 0);
    299  ASSERT_EQ(expect.mBegin, result.mBegin);
    300  ASSERT_EQ(expect.mEnd, result.mEnd);
    301 }
    302 
    303 TEST(WordBreak, TestNextWordBreakWithEmptyString)
    304 {
    305  char16_t empty[] = {};
    306  ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 0));
    307  ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 1));
    308 }
    309 
    310 TEST(WordBreak, TestFindWordBreakFromPosition)
    311 {
    312  TestFindWordBreakFromPosition(0, 0, "This");
    313  TestFindWordBreakFromPosition(1, 0, "his");
    314  TestFindWordBreakFromPosition(2, 0, "is");
    315  TestFindWordBreakFromPosition(3, 0, "is");
    316  TestFindWordBreakFromPosition(3, 1, "is");
    317  TestFindWordBreakFromPosition(3, 9, " ");
    318  TestFindWordBreakFromPosition(3, 10, "internationalization");
    319  TestFindWordBreakFromPosition(4, 0, "ernationalization");
    320  TestFindWordBreakFromPosition(5, 0, "ernationalization");
    321  TestFindWordBreakFromPosition(6, 4, "ernationalization");
    322  TestFindWordBreakFromPosition(6, 8, "ernationalization");
    323  TestFindWordBreakFromPosition(7, 6, " ");
    324  TestFindWordBreakFromPosition(7, 7, "work");
    325 }
    326 
    327 // Test for StopAtPunctuation option.
    328 TEST(WordBreak, TestFindBreakWithStopAtPunctuation)
    329 {
    330  bool original =
    331      mozilla::Preferences::GetBool("intl.icu4x.segmenter.enabled", true);
    332 
    333  // Not UAX#29 rule
    334  mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", false);
    335 
    336  nsString fragText(u"one.two");
    337 
    338  mozilla::intl::WordRange result1 = WordBreaker::FindWord(fragText, 0);
    339  ASSERT_EQ(0u, result1.mBegin);
    340  ASSERT_EQ(3u, result1.mEnd);
    341  mozilla::intl::WordRange result2 = WordBreaker::FindWord(fragText, 3);
    342  ASSERT_EQ(3u, result2.mBegin);
    343  ASSERT_EQ(4u, result2.mEnd);
    344  mozilla::intl::WordRange result3 = WordBreaker::FindWord(fragText, 4);
    345  ASSERT_EQ(4u, result3.mBegin);
    346  ASSERT_EQ(7u, result3.mEnd);
    347 
    348  // UAX#29 rule
    349  mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", true);
    350 
    351  mozilla::intl::WordRange result4 = WordBreaker::FindWord(
    352      fragText, 0, WordBreaker::FindWordOptions::StopAtPunctuation);
    353  ASSERT_EQ(0u, result4.mBegin);
    354  ASSERT_EQ(3u, result4.mEnd);
    355  mozilla::intl::WordRange result5 = WordBreaker::FindWord(
    356      fragText, 3, WordBreaker::FindWordOptions::StopAtPunctuation);
    357  ASSERT_EQ(3u, result5.mBegin);
    358  ASSERT_EQ(4u, result5.mEnd);
    359  mozilla::intl::WordRange result6 = WordBreaker::FindWord(
    360      fragText, 4, WordBreaker::FindWordOptions::StopAtPunctuation);
    361  ASSERT_EQ(4u, result6.mBegin);
    362  ASSERT_EQ(7u, result6.mEnd);
    363 
    364  // Default (without StopAtPunctuation)
    365  mozilla::intl::WordRange result7 = WordBreaker::FindWord(fragText, 0);
    366  ASSERT_EQ(0u, result7.mBegin);
    367  ASSERT_EQ(7u, result7.mEnd);
    368  mozilla::intl::WordRange result8 = WordBreaker::FindWord(fragText, 3);
    369  ASSERT_EQ(0u, result8.mBegin);
    370  ASSERT_EQ(7u, result8.mEnd);
    371  mozilla::intl::WordRange result9 = WordBreaker::FindWord(fragText, 4);
    372  ASSERT_EQ(0u, result9.mBegin);
    373  ASSERT_EQ(7u, result9.mEnd);
    374 
    375  mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", original);
    376 }