TestBreak.cpp (12510B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include <stdio.h> 8 9 #include "gtest/gtest.h" 10 #include "mozilla/intl/LineBreaker.h" 11 #include "mozilla/intl/WordBreaker.h" 12 #include "mozilla/Preferences.h" 13 #include "mozilla/Span.h" 14 #include "nsISupports.h" 15 #include "nsServiceManagerUtils.h" 16 #include "nsString.h" 17 #include "nsTArray.h" 18 #include "nsXPCOM.h" 19 20 using mozilla::intl::LineBreaker; 21 using mozilla::intl::WordBreaker; 22 23 // Turn off clang-format to align the ruler comments to the test strings. 24 25 // clang-format off 26 static char teng0[] = 27 // 1 2 3 4 5 6 7 28 // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 29 "hello world"; 30 // clang-format on 31 32 static uint32_t lexp0[] = {5, 11}; 33 34 static uint32_t wexp0[] = {5, 6, 11}; 35 36 // clang-format off 37 static char teng1[] = 38 // 1 2 3 4 5 6 7 39 // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 40 "This is a test to test(reasonable) line break. This 0.01123 = 45 x 48."; 41 // clang-format on 42 43 static uint32_t lexp1[] = {4, 7, 9, 14, 17, 34, 39, 40, 41, 44 42, 49, 54, 62, 64, 67, 69, 73}; 45 46 static uint32_t wexp1[] = {4, 5, 7, 8, 9, 10, 14, 15, 17, 18, 22, 23, 47 33, 34, 35, 39, 43, 48, 49, 50, 54, 55, 56, 57, 48 62, 63, 64, 65, 67, 68, 69, 70, 72, 73}; 49 50 // clang-format off 51 static char teng2[] = 52 // 1 2 3 4 5 6 7 53 // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 54 "()((reasonab(l)e) line break. .01123=45x48."; 55 // clang-format on 56 57 static uint32_t lexp2[] = {17, 22, 23, 30, 44}; 58 59 static uint32_t wexp2[] = {4, 12, 13, 14, 15, 16, 17, 18, 22, 60 24, 29, 30, 31, 32, 37, 38, 43, 44}; 61 62 // clang-format off 63 static char teng3[] = 64 // 1 2 3 4 5 6 7 65 // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 66 "It's a test to test(ronae ) line break...."; 67 // clang-format on 68 69 static uint32_t lexp3[] = {4, 6, 11, 14, 25, 27, 32, 42}; 70 71 static uint32_t wexp3[] = {2, 3, 4, 5, 6, 7, 11, 12, 14, 15, 72 19, 20, 25, 26, 27, 28, 32, 33, 38, 42}; 73 74 static char ruler1[] = 75 " 1 2 3 4 5 6 7 "; 76 static char ruler2[] = 77 "0123456789012345678901234567890123456789012345678901234567890123456789012"; 78 79 bool Check(const char* in, mozilla::Span<const uint32_t> out, 80 mozilla::Span<const uint32_t> res) { 81 const uint32_t outlen = out.Length(); 82 const uint32_t i = res.Length(); 83 bool ok = true; 84 85 if (i != outlen) { 86 ok = false; 87 printf("WARNING!!! return size wrong, expect %d but got %d \n", outlen, i); 88 } 89 90 for (uint32_t j = 0; j < i; j++) { 91 if (j < outlen) { 92 if (res[j] != out[j]) { 93 ok = false; 94 printf("[%d] expect %d but got %d\n", j, out[j], res[j]); 95 } 96 } else { 97 ok = false; 98 printf("[%d] additional %d\n", j, res[j]); 99 } 100 } 101 102 if (!ok) { 103 printf("string = \n%s\n", in); 104 printf("%s\n", ruler1); 105 printf("%s\n", ruler2); 106 107 printf("Expect = \n"); 108 for (uint32_t j = 0; j < outlen; j++) { 109 printf("%d,", out[j]); 110 } 111 112 printf("\nResult = \n"); 113 for (uint32_t j = 0; j < i; j++) { 114 printf("%d,", res[j]); 115 } 116 printf("\n"); 117 } 118 119 return ok; 120 } 121 122 bool TestASCIILB(const char* in, mozilla::Span<const uint32_t> out) { 123 NS_ConvertASCIItoUTF16 input(in); 124 EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!"; 125 126 nsTArray<uint32_t> result; 127 int32_t curr = 0; 128 while (true) { 129 curr = LineBreaker::Next(input.get(), input.Length(), curr); 130 if (curr == NS_LINEBREAKER_NEED_MORE_TEXT) { 131 break; 132 } 133 result.AppendElement(curr); 134 } 135 136 return Check(in, out, result); 137 } 138 139 bool TestASCIIWB(const char* in, mozilla::Span<const uint32_t> out) { 140 NS_ConvertASCIItoUTF16 input(in); 141 EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!"; 142 143 nsTArray<uint32_t> result; 144 int32_t curr = 0; 145 while (true) { 146 curr = WordBreaker::Next(input.get(), input.Length(), curr); 147 if (curr == NS_WORDBREAKER_NEED_MORE_TEXT) { 148 break; 149 } 150 result.AppendElement(curr); 151 } 152 153 return Check(in, out, result); 154 } 155 156 TEST(LineBreak, LineBreaker) 157 { 158 ASSERT_TRUE(TestASCIILB(teng0, lexp0)); 159 ASSERT_TRUE(TestASCIILB(teng1, lexp1)); 160 ASSERT_TRUE(TestASCIILB(teng2, lexp2)); 161 ASSERT_TRUE(TestASCIILB(teng3, lexp3)); 162 } 163 164 TEST(WordBreak, WordBreaker) 165 { 166 ASSERT_TRUE(TestASCIIWB(teng0, wexp0)); 167 ASSERT_TRUE(TestASCIIWB(teng1, wexp1)); 168 ASSERT_TRUE(TestASCIIWB(teng2, wexp2)); 169 ASSERT_TRUE(TestASCIIWB(teng3, wexp3)); 170 } 171 172 // 012345678901234 173 static const char wb0[] = "T"; 174 static const char wb1[] = "h"; 175 static const char wb2[] = ""; 176 static const char wb3[] = "is is a int"; 177 static const char wb4[] = ""; 178 static const char wb5[] = ""; 179 static const char wb6[] = "ernationali"; 180 static const char wb7[] = "zation work."; 181 182 static const char* wb[] = {wb0, wb1, wb2, wb3, wb4, wb5, wb6, wb7}; 183 184 TEST(WordBreak, TestPrintWordWithBreak) 185 { 186 uint32_t numOfFragment = sizeof(wb) / sizeof(char*); 187 188 // This test generate the result string by appending '^' at every word break 189 // opportunity except the one at end of the text. 190 nsAutoString result; 191 192 for (uint32_t i = 0; i < numOfFragment; i++) { 193 NS_ConvertASCIItoUTF16 fragText(wb[i]); 194 195 int32_t cur = 0; 196 cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur); 197 uint32_t start = 0; 198 while (cur != NS_WORDBREAKER_NEED_MORE_TEXT) { 199 result.Append(Substring(fragText, start, cur - start)); 200 201 // Append '^' only if cur is within the fragText. We'll check the word 202 // break opportunity between fragText and nextFragText using 203 // BreakInBetween() below. 204 if (cur < static_cast<int32_t>(fragText.Length())) { 205 result.Append('^'); 206 } 207 start = (cur >= 0 ? cur : cur - start); 208 cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur); 209 } 210 211 if (i != numOfFragment - 1) { 212 NS_ConvertASCIItoUTF16 nextFragText(wb[i + 1]); 213 if (nextFragText.IsEmpty()) { 214 // If nextFragText is empty, there's no new possible word break 215 // opportunity. 216 continue; 217 } 218 219 const auto origFragLen = static_cast<int32_t>(fragText.Length()); 220 fragText.Append(nextFragText); 221 222 bool canBreak = 223 origFragLen == 224 WordBreaker::Next(fragText.get(), fragText.Length(), origFragLen - 1); 225 if (canBreak) { 226 result.Append('^'); 227 } 228 } 229 } 230 ASSERT_STREQ("This^ ^is^ ^a^ ^internationalization^ ^work^.", 231 NS_ConvertUTF16toUTF8(result).get()); 232 } 233 234 // This function searches a complete word starting from |offset| in wb[fragN]. 235 // If it reaches the end of wb[fragN], and there is no word break opportunity 236 // between wb[fragN] and wb[fragN+1], it will continue the search in wb[fragN+1] 237 // until a word break. 238 void TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset, 239 const char* expected) { 240 uint32_t numOfFragment = sizeof(wb) / sizeof(char*); 241 242 NS_ConvertASCIItoUTF16 fragText(wb[fragN]); 243 244 mozilla::intl::WordRange res = WordBreaker::FindWord(fragText, offset); 245 246 nsAutoString result(Substring(fragText, res.mBegin, res.mEnd - res.mBegin)); 247 248 if ((uint32_t)fragText.Length() <= res.mEnd) { 249 // if we hit the end of the fragment 250 nsAutoString curFragText = fragText; 251 for (uint32_t p = fragN + 1; p < numOfFragment; p++) { 252 NS_ConvertASCIItoUTF16 nextFragText(wb[p]); 253 if (nextFragText.IsEmpty()) { 254 // If nextFragText is empty, there's no new possible word break 255 // opportunity between curFragText and nextFragText. 256 continue; 257 } 258 259 const auto origFragLen = static_cast<int32_t>(curFragText.Length()); 260 curFragText.Append(nextFragText); 261 bool canBreak = origFragLen == WordBreaker::Next(curFragText.get(), 262 curFragText.Length(), 263 origFragLen - 1); 264 if (canBreak) { 265 break; 266 } 267 mozilla::intl::WordRange r = WordBreaker::FindWord(nextFragText, 0); 268 269 result.Append(Substring(nextFragText, r.mBegin, r.mEnd - r.mBegin)); 270 271 if ((uint32_t)nextFragText.Length() != r.mEnd) { 272 break; 273 } 274 } 275 } 276 277 ASSERT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get()) 278 << "FindWordBreakFromPosition(" << fragN << ", " << offset << ")"; 279 } 280 281 TEST(WordBreak, TestNextWordBreakWithComplexLanguage) 282 { 283 nsString fragText(u"\u0e40\u0e1b\u0e47\u0e19\u0e19\u0e31\u0e01"); 284 285 int32_t offset = 0; 286 while (offset != NS_WORDBREAKER_NEED_MORE_TEXT) { 287 int32_t newOffset = 288 WordBreaker::Next(fragText.get(), fragText.Length(), offset); 289 ASSERT_NE(offset, newOffset); 290 offset = newOffset; 291 } 292 ASSERT_TRUE(true); 293 } 294 295 TEST(WordBreak, TestFindWordWithEmptyString) 296 { 297 mozilla::intl::WordRange expect{0, 0}; 298 mozilla::intl::WordRange result = WordBreaker::FindWord(EmptyString(), 0); 299 ASSERT_EQ(expect.mBegin, result.mBegin); 300 ASSERT_EQ(expect.mEnd, result.mEnd); 301 } 302 303 TEST(WordBreak, TestNextWordBreakWithEmptyString) 304 { 305 char16_t empty[] = {}; 306 ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 0)); 307 ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 1)); 308 } 309 310 TEST(WordBreak, TestFindWordBreakFromPosition) 311 { 312 TestFindWordBreakFromPosition(0, 0, "This"); 313 TestFindWordBreakFromPosition(1, 0, "his"); 314 TestFindWordBreakFromPosition(2, 0, "is"); 315 TestFindWordBreakFromPosition(3, 0, "is"); 316 TestFindWordBreakFromPosition(3, 1, "is"); 317 TestFindWordBreakFromPosition(3, 9, " "); 318 TestFindWordBreakFromPosition(3, 10, "internationalization"); 319 TestFindWordBreakFromPosition(4, 0, "ernationalization"); 320 TestFindWordBreakFromPosition(5, 0, "ernationalization"); 321 TestFindWordBreakFromPosition(6, 4, "ernationalization"); 322 TestFindWordBreakFromPosition(6, 8, "ernationalization"); 323 TestFindWordBreakFromPosition(7, 6, " "); 324 TestFindWordBreakFromPosition(7, 7, "work"); 325 } 326 327 // Test for StopAtPunctuation option. 328 TEST(WordBreak, TestFindBreakWithStopAtPunctuation) 329 { 330 bool original = 331 mozilla::Preferences::GetBool("intl.icu4x.segmenter.enabled", true); 332 333 // Not UAX#29 rule 334 mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", false); 335 336 nsString fragText(u"one.two"); 337 338 mozilla::intl::WordRange result1 = WordBreaker::FindWord(fragText, 0); 339 ASSERT_EQ(0u, result1.mBegin); 340 ASSERT_EQ(3u, result1.mEnd); 341 mozilla::intl::WordRange result2 = WordBreaker::FindWord(fragText, 3); 342 ASSERT_EQ(3u, result2.mBegin); 343 ASSERT_EQ(4u, result2.mEnd); 344 mozilla::intl::WordRange result3 = WordBreaker::FindWord(fragText, 4); 345 ASSERT_EQ(4u, result3.mBegin); 346 ASSERT_EQ(7u, result3.mEnd); 347 348 // UAX#29 rule 349 mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", true); 350 351 mozilla::intl::WordRange result4 = WordBreaker::FindWord( 352 fragText, 0, WordBreaker::FindWordOptions::StopAtPunctuation); 353 ASSERT_EQ(0u, result4.mBegin); 354 ASSERT_EQ(3u, result4.mEnd); 355 mozilla::intl::WordRange result5 = WordBreaker::FindWord( 356 fragText, 3, WordBreaker::FindWordOptions::StopAtPunctuation); 357 ASSERT_EQ(3u, result5.mBegin); 358 ASSERT_EQ(4u, result5.mEnd); 359 mozilla::intl::WordRange result6 = WordBreaker::FindWord( 360 fragText, 4, WordBreaker::FindWordOptions::StopAtPunctuation); 361 ASSERT_EQ(4u, result6.mBegin); 362 ASSERT_EQ(7u, result6.mEnd); 363 364 // Default (without StopAtPunctuation) 365 mozilla::intl::WordRange result7 = WordBreaker::FindWord(fragText, 0); 366 ASSERT_EQ(0u, result7.mBegin); 367 ASSERT_EQ(7u, result7.mEnd); 368 mozilla::intl::WordRange result8 = WordBreaker::FindWord(fragText, 3); 369 ASSERT_EQ(0u, result8.mBegin); 370 ASSERT_EQ(7u, result8.mEnd); 371 mozilla::intl::WordRange result9 = WordBreaker::FindWord(fragText, 4); 372 ASSERT_EQ(0u, result9.mBegin); 373 ASSERT_EQ(7u, result9.mEnd); 374 375 mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", original); 376 }