LineBreaker.cpp (52014B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 #include "mozilla/intl/LineBreaker.h" 7 8 #include "diplomat_runtime.hpp" 9 #include "icu4x/LineBreakIteratorLatin1.hpp" 10 #include "icu4x/LineBreakIteratorUtf16.hpp" 11 #include "icu4x/LineSegmenter.hpp" 12 #include "icu4x/Locale.hpp" 13 #include "jisx4051class.h" 14 #include "LineBreakCache.h" 15 #include "nsComplexBreaker.h" 16 #include "nsTArray.h" 17 #include "nsUnicodeProperties.h" 18 #include "nsThreadUtils.h" 19 #include "mozilla/CheckedInt.h" 20 #include "mozilla/ClearOnShutdown.h" 21 #include "mozilla/intl/Segmenter.h" 22 #include "mozilla/intl/UnicodeProperties.h" 23 #include "mozilla/StaticPrefs_intl.h" 24 25 #include <mutex> 26 27 using namespace icu4x; 28 using namespace mozilla; 29 using namespace mozilla::intl; 30 using namespace mozilla::unicode; 31 32 /* 33 34 Simplification of Pair Table in JIS X 4051 35 36 1. The Origion Table - in 4.1.3 37 38 In JIS x 4051. The pair table is defined as below 39 40 Class of 41 Leading Class of Trailing Char Class 42 Char 43 44 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 45 * # * # 46 1 X X X X X X X X X X X X X X X X X X X X X E 47 2 X X X X X X 48 3 X X X X X X 49 4 X X X X X X 50 5 X X X X X X 51 6 X X X X X X 52 7 X X X X X X X 53 8 X X X X X X E 54 9 X X X X X X 55 10 X X X X X X 56 11 X X X X X X 57 12 X X X X X X 58 13 X X X X X X X 59 14 X X X X X X X 60 15 X X X X X X X X X 61 16 X X X X X X X X 62 17 X X X X X E 63 18 X X X X X X X X X 64 19 X E E E E E X X X X X X X X X X X X E X E E 65 20 X X X X X E 66 67 * Same Char 68 # Other Char 69 70 X Cannot Break 71 72 The classes mean: 73 1: Open parenthesis 74 2: Close parenthesis 75 3: Prohibit a line break before 76 4: Punctuation for sentence end (except Full stop, e.g., "!" and "?") 77 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT) 78 6: Full stop 79 7: Non-breakable between same characters 80 8: Prefix (e.g., "$", "NO.") 81 9: Postfix (e.g., "%") 82 10: Ideographic space 83 11: Hiragana 84 12: Japanese characters (except class 11) 85 13: Subscript 86 14: Ruby 87 15: Numeric 88 16: Alphabet 89 17: Space for Western language 90 18: Western characters (except class 17) 91 19: Split line note (Warichu) begin quote 92 20: Split line note (Warichu) end quote 93 94 2. Simplified by remove the class which we do not care 95 96 However, since we do not care about class 13(Subscript), 14(Ruby), 97 16 (Aphabet), 19(split line note begin quote), and 20(split line note end 98 quote) we can simplify this par table into the following 99 100 Class of 101 Leading Class of Trailing Char Class 102 Char 103 104 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18 105 106 1 X X X X X X X X X X X X X X X 107 2 X X X X X 108 3 X X X X X 109 4 X X X X X 110 5 X X X X X 111 6 X X X X X 112 7 X X X X X X 113 8 X X X X X X 114 9 X X X X X 115 10 X X X X X 116 11 X X X X X 117 12 X X X X X 118 15 X X X X X X X X 119 17 X X X X X 120 18 X X X X X X X 121 122 3. Simplified by merged classes 123 124 After the 2 simplification, the pair table have some duplication 125 a. class 2, 3, 4, 5, 6, are the same- we can merged them 126 b. class 10, 11, 12, 17 are the same- we can merged them 127 128 We introduce an extra non-breaking pair at [b]/7 to better match 129 the expectations of CSS line-breaking as tested by WPT tests. 130 This added entry is marked as * in the tables below. 131 132 Class of 133 Leading Class of Trailing Char Class 134 Char 135 136 1 [a] 7 8 9 [b]15 18 137 138 1 X X X X X X X X 139 [a] X 140 7 X X 141 8 X X 142 9 X 143 [b] X * 144 15 X X X X 145 18 X X X 146 147 148 4. We add COMPLEX characters and make it breakable w/ all ther class 149 except after class 1 and before class [a] 150 151 Class of 152 Leading Class of Trailing Char Class 153 Char 154 155 1 [a] 7 8 9 [b]15 18 COMPLEX 156 157 1 X X X X X X X X X 158 [a] X 159 7 X X 160 8 X X 161 9 X 162 [b] X * 163 15 X X X X 164 18 X X X 165 COMPLEX X T 166 167 T : need special handling 168 169 170 5. However, we need two special class for some punctuations/parentheses, 171 theirs breaking rules like character class (18), see bug 389056. 172 And also we need character like punctuation that is same behavior with 18, 173 but the characters are not letters of all languages. (e.g., '_') 174 [c]. Based on open parenthesis class (1), but it is not breakable after 175 character class (18) or numeric class (15). 176 [d]. Based on close parenthesis (or punctuation) class (2), but it is not 177 breakable before character class (18) or numeric class (15). 178 179 Class of 180 Leading Class of Trailing Char Class 181 Char 182 183 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] 184 185 1 X X X X X X X X X X X 186 [a] X X X 187 7 X X 188 8 X X 189 9 X 190 [b] X * X 191 15 X X X X X X 192 18 X X X X X 193 COMPLEX X T 194 [c] X X X X X X X X X X X 195 [d] X X X X 196 197 198 6. And Unicode has "NON-BREAK" characters. The lines should be broken around 199 them. But in JIS X 4051, such class is not, therefore, we create [e]. 200 201 Class of 202 Leading Class of Trailing Char Class 203 Char 204 205 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] 206 207 1 X X X X X X X X X X X X 208 [a] X X X 209 7 X X X 210 8 X X X 211 9 X X 212 [b] X * X X 213 15 X X X X X X X 214 18 X X X X X X 215 COMPLEX X T X 216 [c] X X X X X X X X X X X X 217 [d] X X X X X 218 [e] X X X X X X X X X X X X 219 220 221 7. Now we use one bit to encode whether it is breakable, and use 2 bytes 222 for one row, then the bit table will look like: 223 224 18 <- 1 225 226 1 0000 1111 1111 1111 = 0x0FFF 227 [a] 0000 1100 0000 0010 = 0x0C02 228 7 0000 1000 0000 0110 = 0x0806 229 8 0000 1000 0100 0010 = 0x0842 230 9 0000 1000 0000 0010 = 0x0802 231 [b] 0000 1100 0000 0110 = 0x0C06 232 15 0000 1110 1101 0010 = 0x0ED2 233 18 0000 1110 1100 0010 = 0x0EC2 234 COMPLEX 0000 1001 0000 0010 = 0x0902 235 [c] 0000 1111 1111 1111 = 0x0FFF 236 [d] 0000 1100 1100 0010 = 0x0CC2 237 [e] 0000 1111 1111 1111 = 0x0FFF 238 */ 239 240 #define MAX_CLASSES 12 241 242 static const uint16_t gPair[MAX_CLASSES] = {0x0FFF, 0x0C02, 0x0806, 0x0842, 243 0x0802, 0x0C06, 0x0ED2, 0x0EC2, 244 0x0902, 0x0FFF, 0x0CC2, 0x0FFF}; 245 246 /* 247 248 8. And if the character is not enough far from word start, word end and 249 another break point, we should not break in non-CJK languages. 250 I.e., Don't break around 15, 18, [c] and [d], but don't change 251 that if they are related to [b]. 252 253 Class of 254 Leading Class of Trailing Char Class 255 Char 256 257 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] 258 259 1 X X X X X X X X X X X X 260 [a] X X X X X X 261 7 X X X X X X X 262 8 X X X X X X 263 9 X X X X X X 264 [b] X * X X 265 15 X X X X X X X X X X X 266 18 X X X X X X X X X X X 267 COMPLEX X X X T X X X 268 [c] X X X X X X X X X X X X 269 [d] X X X X X X X X X X X 270 [e] X X X X X X X X X X X X 271 272 18 <- 1 273 274 1 0000 1111 1111 1111 = 0x0FFF 275 [a] 0000 1110 1100 0010 = 0x0EC2 276 7 0000 1110 1100 0110 = 0x0EC6 277 8 0000 1110 1100 0010 = 0x0EC2 278 9 0000 1110 1100 0010 = 0x0EC2 279 [b] 0000 1100 0000 0110 = 0x0C06 280 15 0000 1111 1101 1111 = 0x0FDF 281 18 0000 1111 1101 1111 = 0x0FDF 282 COMPLEX 0000 1111 1100 0010 = 0x0FC2 283 [c] 0000 1111 1111 1111 = 0x0FFF 284 [d] 0000 1111 1101 1111 = 0x0FDF 285 [e] 0000 1111 1111 1111 = 0x0FFF 286 */ 287 288 static const uint16_t gPairConservative[MAX_CLASSES] = { 289 0x0FFF, 0x0EC2, 0x0EC6, 0x0EC2, 0x0EC2, 0x0C06, 290 0x0FDF, 0x0FDF, 0x0FC2, 0x0FFF, 0x0FDF, 0x0FFF}; 291 292 /* 293 294 9. Now we map the class to number 295 296 0: 1 297 1: [a]- 2, 3, 4, 5, 6 298 2: 7 299 3: 8 300 4: 9 301 5: [b]- 10, 11, 12, 17 302 6: 15 303 7: 18 304 8: COMPLEX 305 9: [c] 306 A: [d] 307 B: [e] 308 309 and they mean: 310 0: Open parenthesis 311 1: Punctuation that prohibits break before 312 2: Non-breakable between same classes 313 3: Prefix 314 4: Postfix 315 5: Breakable character (Spaces and Most Japanese characters) 316 6: Numeric 317 7: Characters 318 8: Need special handling characters (E.g., Thai) 319 9: Open parentheses like Character (See bug 389056) 320 A: Close parenthese (or punctuations) like Character (See bug 389056) 321 B: Non breakable (See bug 390920) 322 323 */ 324 325 #define CLASS_NONE INT8_MAX 326 327 #define CLASS_OPEN 0x00 328 #define CLASS_CLOSE 0x01 329 #define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02 330 #define CLASS_PREFIX 0x03 331 #define CLASS_POSTFFIX 0x04 332 #define CLASS_BREAKABLE 0x05 333 #define CLASS_NUMERIC 0x06 334 #define CLASS_CHARACTER 0x07 335 #define CLASS_COMPLEX 0x08 336 #define CLASS_OPEN_LIKE_CHARACTER 0x09 337 #define CLASS_CLOSE_LIKE_CHARACTER 0x0A 338 #define CLASS_NON_BREAKABLE 0x0B 339 340 #define U_NULL char16_t(0x0000) 341 #define U_SLASH char16_t('/') 342 #define U_SPACE char16_t(' ') 343 #define U_HYPHEN char16_t('-') 344 #define U_EQUAL char16_t('=') 345 #define U_PERCENT char16_t('%') 346 #define U_AMPERSAND char16_t('&') 347 #define U_SEMICOLON char16_t(';') 348 #define U_BACKSLASH char16_t('\\') 349 #define U_OPEN_SINGLE_QUOTE char16_t(0x2018) 350 #define U_OPEN_DOUBLE_QUOTE char16_t(0x201C) 351 #define U_OPEN_GUILLEMET char16_t(0x00AB) 352 353 #define NEED_CONTEXTUAL_ANALYSIS(c) \ 354 (IS_HYPHEN(c) || (c) == U_SLASH || (c) == U_PERCENT || (c) == U_AMPERSAND || \ 355 (c) == U_SEMICOLON || (c) == U_BACKSLASH || (c) == U_OPEN_SINGLE_QUOTE || \ 356 (c) == U_OPEN_DOUBLE_QUOTE || (c) == U_OPEN_GUILLEMET) 357 358 #define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039) 359 360 static inline int GETCLASSFROMTABLE(const uint32_t* t, uint16_t l) { 361 return ((((t)[(l >> 3)]) >> ((l & 0x0007) << 2)) & 0x000f); 362 } 363 364 static inline int IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u) { 365 return ((0xff66 <= (u)) && ((u) <= 0xff70)); 366 } 367 368 static inline int IS_CJK_CHAR(char32_t u) { 369 return ( 370 (0x1100 <= (u) && (u) <= 0x11ff) || (0x2e80 <= (u) && (u) <= 0xd7ff) || 371 (0xf900 <= (u) && (u) <= 0xfaff) || (0xff00 <= (u) && (u) <= 0xffef) || 372 (0x20000 <= (u) && (u) <= 0x2fffd)); 373 } 374 375 static inline bool IS_NONBREAKABLE_SPACE(char16_t u) { 376 return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE 377 } 378 379 static inline bool IS_HYPHEN(char16_t u) { 380 return (u == U_HYPHEN || u == 0x2010 || // HYPHEN 381 u == 0x2012 || // FIGURE DASH 382 u == 0x2013 || // EN DASH 383 #if ANDROID || XP_WIN 384 /* Bug 1647377: On Android and Windows, we don't have a "platform" 385 * backend that supports Tibetan (nsRuleBreaker.cpp only knows about 386 * Thai, and ScriptBreak doesn't handle Tibetan well either), so 387 * instead we just treat the TSHEG like a hyphen to provide basic 388 * line-breaking possibilities. 389 */ 390 u == 0x0F0B || // TIBETAN MARK INTERSYLLABIC TSHEG 391 #endif 392 u == 0x058A); // ARMENIAN HYPHEN 393 } 394 395 static int8_t GetClass(uint32_t u, LineBreakRule aLevel, 396 bool aIsChineseOrJapanese) { 397 // Mapping for Unicode LineBreak.txt classes to the (simplified) set of 398 // character classes used here. 399 // XXX The mappings here were derived by comparing the Unicode LineBreak 400 // values of BMP characters to the classes our existing GetClass returns 401 // for the same codepoints; in cases where characters with the same 402 // LineBreak class mapped to various classes here, I picked what seemed 403 // the most prevalent equivalence. 404 // Some of these are unclear to me, but currently they are ONLY used 405 // for characters not handled by the old code below, so all the JISx405 406 // special cases should already be accounted for. 407 static const int8_t sUnicodeLineBreakToClass[] = { 408 /* UNKNOWN = 0, [XX] */ CLASS_CHARACTER, 409 /* AMBIGUOUS = 1, [AI] */ CLASS_CHARACTER, 410 /* ALPHABETIC = 2, [AL] */ CLASS_CHARACTER, 411 /* BREAK_BOTH = 3, [B2] */ CLASS_CHARACTER, 412 /* BREAK_AFTER = 4, [BA] */ CLASS_BREAKABLE, 413 /* BREAK_BEFORE = 5, [BB] */ CLASS_OPEN_LIKE_CHARACTER, 414 /* MANDATORY_BREAK = 6, [BK] */ CLASS_CHARACTER, 415 /* CONTINGENT_BREAK = 7, [CB] */ CLASS_CHARACTER, 416 /* CLOSE_PUNCTUATION = 8, [CL] */ CLASS_CLOSE_LIKE_CHARACTER, 417 /* COMBINING_MARK = 9, [CM] */ CLASS_CHARACTER, 418 /* CARRIAGE_RETURN = 10, [CR] */ CLASS_BREAKABLE, 419 /* EXCLAMATION = 11, [EX] */ CLASS_CLOSE_LIKE_CHARACTER, 420 /* GLUE = 12, [GL] */ CLASS_NON_BREAKABLE, 421 /* HYPHEN = 13, [HY] */ CLASS_CHARACTER, 422 /* IDEOGRAPHIC = 14, [ID] */ CLASS_BREAKABLE, 423 /* INSEPARABLE = 15, [IN] */ CLASS_CLOSE_LIKE_CHARACTER, 424 /* INFIX_NUMERIC = 16, [IS] */ CLASS_CHARACTER, 425 /* LINE_FEED = 17, [LF] */ CLASS_BREAKABLE, 426 /* NONSTARTER = 18, [NS] */ CLASS_CLOSE_LIKE_CHARACTER, 427 /* NUMERIC = 19, [NU] */ CLASS_NUMERIC, 428 /* OPEN_PUNCTUATION = 20, [OP] */ CLASS_OPEN_LIKE_CHARACTER, 429 /* POSTFIX_NUMERIC = 21, [PO] */ CLASS_CLOSE_LIKE_CHARACTER, 430 /* PREFIX_NUMERIC = 22, [PR] */ CLASS_CHARACTER, 431 /* QUOTATION = 23, [QU] */ CLASS_CHARACTER, 432 /* COMPLEX_CONTEXT = 24, [SA] */ CLASS_CHARACTER, 433 /* SURROGATE = 25, [SG] */ CLASS_CHARACTER, 434 /* SPACE = 26, [SP] */ CLASS_BREAKABLE, 435 /* BREAK_SYMBOLS = 27, [SY] */ CLASS_CHARACTER, 436 /* ZWSPACE = 28, [ZW] */ CLASS_BREAKABLE, 437 /* NEXT_LINE = 29, [NL] */ CLASS_CHARACTER, 438 /* WORD_JOINER = 30, [WJ] */ CLASS_NON_BREAKABLE, 439 /* H2 = 31, [H2] */ CLASS_BREAKABLE, 440 /* H3 = 32, [H3] */ CLASS_BREAKABLE, 441 /* JL = 33, [JL] */ CLASS_CHARACTER, 442 /* JT = 34, [JT] */ CLASS_CHARACTER, 443 /* JV = 35, [JV] */ CLASS_CHARACTER, 444 /* CLOSE_PARENTHESIS = 36, [CP] */ CLASS_CLOSE_LIKE_CHARACTER, 445 /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE, 446 /* HEBREW_LETTER = 38, [HL] */ CLASS_CHARACTER, 447 /* REGIONAL_INDICATOR = 39, [RI] */ CLASS_CHARACTER, 448 /* E_BASE = 40, [EB] */ CLASS_BREAKABLE, 449 /* E_MODIFIER = 41, [EM] */ CLASS_CHARACTER, 450 /* ZWJ = 42, [ZWJ]*/ CLASS_CHARACTER, 451 /* AKSARA = 43, [AK] */ CLASS_CHARACTER, 452 /* AKSARA_PREBASE = 44, [AP] */ CLASS_CHARACTER, 453 /* AKSARA_START = 45, [AS] */ CLASS_CHARACTER, 454 /* VIRAMA_FINAL = 46, [VF] */ CLASS_CHARACTER, 455 /* VIRAMA = 47, [VI] */ CLASS_CHARACTER, 456 /* UNAMBIGUOUS_HYPHEN = 48 [HH] */ CLASS_BREAKABLE, 457 }; 458 459 static_assert(U_LB_COUNT == std::size(sUnicodeLineBreakToClass), 460 "Gecko vs ICU LineBreak class mismatch"); 461 462 auto cls = GetLineBreakClass(u); 463 MOZ_ASSERT(cls < std::size(sUnicodeLineBreakToClass)); 464 465 // Overrides based on rules for the different line-break values given in 466 // https://drafts.csswg.org/css-text-3/#line-break-property 467 switch (aLevel) { 468 case LineBreakRule::Auto: 469 // For now, just use legacy Gecko behavior. 470 // XXX Possible enhancement - vary strictness according to line width 471 // or other criteria. 472 break; 473 case LineBreakRule::Strict: 474 if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER || 475 (u == 0x3095 || u == 0x3096 || u == 0x30f5 || u == 0x30f6)) { 476 return CLASS_CLOSE; 477 } 478 if (cls == U_LB_INSEPARABLE) { 479 return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS; 480 } 481 if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E || 482 u == 0x30FD || u == 0x30FE) { 483 return CLASS_CLOSE_LIKE_CHARACTER; 484 } 485 if (aIsChineseOrJapanese) { 486 if (cls == U_LB_POSTFIX_NUMERIC && 487 UnicodeProperties::IsEastAsianWidthAFW(u)) { 488 return CLASS_CLOSE_LIKE_CHARACTER; 489 } 490 if (cls == U_LB_PREFIX_NUMERIC && 491 UnicodeProperties::IsEastAsianWidthAFW(u)) { 492 return CLASS_OPEN_LIKE_CHARACTER; 493 } 494 if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) { 495 return CLASS_CLOSE_LIKE_CHARACTER; 496 } 497 } 498 break; 499 case LineBreakRule::Normal: 500 if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) { 501 return CLASS_BREAKABLE; 502 } 503 if (cls == U_LB_INSEPARABLE) { 504 return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS; 505 } 506 if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E || 507 u == 0x30FD || u == 0x30FE) { 508 return CLASS_CLOSE_LIKE_CHARACTER; 509 } 510 if (aIsChineseOrJapanese) { 511 if (cls == U_LB_POSTFIX_NUMERIC && 512 UnicodeProperties::IsEastAsianWidthAFW(u)) { 513 return CLASS_CLOSE_LIKE_CHARACTER; 514 } 515 if (cls == U_LB_PREFIX_NUMERIC && 516 UnicodeProperties::IsEastAsianWidthAFW(u)) { 517 return CLASS_OPEN_LIKE_CHARACTER; 518 } 519 if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) { 520 return CLASS_BREAKABLE; 521 } 522 } 523 break; 524 case LineBreakRule::Loose: 525 if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) { 526 return CLASS_BREAKABLE; 527 } 528 if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E || 529 u == 0x30FD || u == 0x30FE) { 530 return CLASS_BREAKABLE; 531 } 532 if (cls == U_LB_INSEPARABLE) { 533 return CLASS_BREAKABLE; 534 } 535 if (aIsChineseOrJapanese) { 536 if (u == 0x30FB || u == 0xFF1A || u == 0xFF1B || u == 0xFF65 || 537 u == 0x203C || u == 0x2047 || u == 0x2048 || u == 0x2049 || 538 u == 0xFF01 || u == 0xFF1F) { 539 return CLASS_BREAKABLE; 540 } 541 if (cls == U_LB_POSTFIX_NUMERIC && 542 UnicodeProperties::IsEastAsianWidthAFW(u)) { 543 return CLASS_BREAKABLE; 544 } 545 if (cls == U_LB_PREFIX_NUMERIC && 546 UnicodeProperties::IsEastAsianWidthAFW(u)) { 547 return CLASS_BREAKABLE; 548 } 549 if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) { 550 return CLASS_BREAKABLE; 551 } 552 } 553 break; 554 case LineBreakRule::Anywhere: 555 MOZ_ASSERT_UNREACHABLE("should have been handled already"); 556 break; 557 } 558 559 if (u < 0x10000) { 560 uint16_t h = u & 0xFF00; 561 uint16_t l = u & 0x00ff; 562 563 // Handle 3 range table first 564 if (0x0000 == h) { 565 return GETCLASSFROMTABLE(gLBClass00, l); 566 } 567 if (0x1700 == h) { 568 return GETCLASSFROMTABLE(gLBClass17, l); 569 } 570 if (NS_NeedsPlatformNativeHandling(u)) { 571 return CLASS_COMPLEX; 572 } 573 if (0x0E00 == h) { 574 return GETCLASSFROMTABLE(gLBClass0E, l); 575 } 576 if (0x2000 == h) { 577 return GETCLASSFROMTABLE(gLBClass20, l); 578 } 579 if (0x2100 == h) { 580 return GETCLASSFROMTABLE(gLBClass21, l); 581 } 582 if (0x3000 == h) { 583 return GETCLASSFROMTABLE(gLBClass30, l); 584 } 585 if (0xff00 == h) { 586 if (l <= 0x0060) { // Fullwidth ASCII variant 587 // Previously, we treated Fullwidth chars the same as their ASCII 588 // counterparts, but UAX#14 (LineBreak.txt) disagrees with this and 589 // treats many of them as ideograph-like. 590 return sUnicodeLineBreakToClass[cls]; 591 } 592 if (l < 0x00a0) { // Halfwidth Katakana variants 593 switch (l) { 594 case 0x61: 595 return GetClass(0x3002, aLevel, aIsChineseOrJapanese); 596 case 0x62: 597 return GetClass(0x300c, aLevel, aIsChineseOrJapanese); 598 case 0x63: 599 return GetClass(0x300d, aLevel, aIsChineseOrJapanese); 600 case 0x64: 601 return GetClass(0x3001, aLevel, aIsChineseOrJapanese); 602 case 0x65: 603 return GetClass(0x30fb, aLevel, aIsChineseOrJapanese); 604 case 0x9e: 605 return GetClass(0x309b, aLevel, aIsChineseOrJapanese); 606 case 0x9f: 607 return GetClass(0x309c, aLevel, aIsChineseOrJapanese); 608 default: 609 if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) { 610 return CLASS_CLOSE; // jis x4051 class 3 611 } 612 return CLASS_BREAKABLE; // jis x4051 class 11 613 } 614 } 615 if (l < 0x00e0) { 616 return CLASS_CHARACTER; // Halfwidth Hangul variants 617 } 618 if (l < 0x00f0) { 619 static char16_t NarrowFFEx[16] = { 620 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000, 621 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000}; 622 return GetClass(NarrowFFEx[l - 0x00e0], aLevel, aIsChineseOrJapanese); 623 } 624 } else if (0x3100 == h) { 625 if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun 626 // XXX: This is per UAX #14, but UAX #14 may change 627 // the line breaking rules about Kanbun and Bopomofo. 628 return CLASS_BREAKABLE; 629 } 630 if (l >= 0xf0) { // Katakana small letters for Ainu 631 return CLASS_CLOSE; 632 } 633 } else if (0x0300 == h) { 634 if (0x4F == l || (0x5C <= l && l <= 0x62)) { 635 return CLASS_NON_BREAKABLE; 636 } 637 } else if (0x0500 == h) { 638 // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14) 639 if (l == 0x8A) { 640 return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); 641 } 642 } else if (0x0F00 == h) { 643 // We treat Tibetan TSHEG as a hyphen (when not using platform breaker); 644 // other Tibetan chars with LineBreak class=BA will be handled by the 645 // default sUnicodeLineBreakToClass mapping below. 646 if (l == 0x0B) { 647 return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); 648 } 649 } else if (0x1800 == h) { 650 if (0x0E == l) { 651 return CLASS_NON_BREAKABLE; 652 } 653 } else if (0x1600 == h) { 654 if (0x80 == l) { // U+1680 OGHAM SPACE MARK 655 return CLASS_BREAKABLE; 656 } 657 } else if (u == 0xfeff) { 658 return CLASS_NON_BREAKABLE; 659 } 660 } 661 662 return sUnicodeLineBreakToClass[cls]; 663 } 664 665 static bool GetPair(int8_t c1, int8_t c2) { 666 NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1"); 667 NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2"); 668 669 return (0 == ((gPair[c1] >> c2) & 0x0001)); 670 } 671 672 static bool GetPairConservative(int8_t c1, int8_t c2) { 673 NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1"); 674 NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2"); 675 676 return (0 == ((gPairConservative[c1] >> c2) & 0x0001)); 677 } 678 679 class ContextState { 680 public: 681 ContextState(const char16_t* aText, uint32_t aLength) 682 : mUniText(aText), mText(nullptr), mLength(aLength) { 683 Init(); 684 } 685 686 ContextState(const uint8_t* aText, uint32_t aLength) 687 : mUniText(nullptr), mText(aText), mLength(aLength) { 688 Init(); 689 } 690 691 uint32_t Length() const { return mLength; } 692 uint32_t Index() const { return mIndex; } 693 694 // This gets a single code unit of the text, without checking for surrogates 695 // (in the case of a 16-bit text buffer). That's OK if we're only checking for 696 // specific characters that are known to be BMP values. 697 char16_t GetCodeUnitAt(uint32_t aIndex) const { 698 MOZ_ASSERT(aIndex < mLength, "Out of range!"); 699 return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]); 700 } 701 702 // This gets a 32-bit Unicode character (codepoint), handling surrogate pairs 703 // as necessary. It must ONLY be called for 16-bit text, not 8-bit. 704 char32_t GetUnicodeCharAt(uint32_t aIndex) const { 705 MOZ_ASSERT(mUniText, "Only for 16-bit text!"); 706 MOZ_ASSERT(aIndex < mLength, "Out of range!"); 707 char32_t c = mUniText[aIndex]; 708 if (aIndex + 1 < mLength && NS_IS_SURROGATE_PAIR(c, mUniText[aIndex + 1])) { 709 c = SURROGATE_TO_UCS4(c, mUniText[aIndex + 1]); 710 } 711 return c; 712 } 713 714 void AdvanceIndex() { ++mIndex; } 715 716 void NotifyBreakBefore() { mLastBreakIndex = mIndex; } 717 718 // A word of western language should not be broken. But even if the word has 719 // only ASCII characters, non-natural context words should be broken, e.g., 720 // URL and file path. For protecting the natural words, we should use 721 // conservative breaking rules at following conditions: 722 // 1. at near the start of word 723 // 2. at near the end of word 724 // 3. at near the latest broken point 725 // CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters, 726 // which varies depending whether we are looking at a letter or a non-letter 727 // character: for non-letters, we use an extended "conservative" range. 728 729 #define CONSERVATIVE_RANGE_LETTER 2 730 #define CONSERVATIVE_RANGE_OTHER 6 731 732 bool UseConservativeBreaking(uint32_t aOffset = 0) const { 733 if (mHasCJKChar) return false; 734 uint32_t index = mIndex + aOffset; 735 736 // If the character at index is a letter (rather than various punctuation 737 // characters, etc) then we want a shorter "conservative" range 738 uint32_t conservativeRangeStart, conservativeRangeEnd; 739 if (index < mLength && 740 nsUGenCategory::kLetter == 741 (mText ? GetGenCategory(mText[index]) 742 : GetGenCategory(GetUnicodeCharAt(index)))) { 743 // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start 744 // to get more balanced behavior (if we break off a 2-letter prefix, 745 // that means the break will actually be three letters from start of 746 // word, to include the hyphen; whereas a 2-letter suffix will be 747 // broken only two letters from end of word). 748 conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER; 749 conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1; 750 } else { 751 conservativeRangeEnd = conservativeRangeStart = CONSERVATIVE_RANGE_OTHER; 752 } 753 754 bool result = (index < conservativeRangeStart || 755 mLength - index < conservativeRangeEnd || 756 index - mLastBreakIndex < conservativeRangeStart); 757 if (result || !mHasNonbreakableSpace) return result; 758 759 // This text has no-breakable space, we need to check whether the index 760 // is near it. 761 762 // Note that index is always larger than conservativeRange here. 763 for (uint32_t i = index; index - conservativeRangeStart < i; --i) { 764 if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i - 1))) return true; 765 } 766 // Note that index is always less than mLength - conservativeRange. 767 for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) { 768 if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i))) return true; 769 } 770 return false; 771 } 772 773 bool HasPreviousEqualsSign() const { return mHasPreviousEqualsSign; } 774 void NotifySeenEqualsSign() { mHasPreviousEqualsSign = true; } 775 776 bool HasPreviousSlash() const { return mHasPreviousSlash; } 777 void NotifySeenSlash() { mHasPreviousSlash = true; } 778 779 bool HasPreviousBackslash() const { return mHasPreviousBackslash; } 780 void NotifySeenBackslash() { mHasPreviousBackslash = true; } 781 782 uint32_t GetPreviousNonHyphenCharacter() const { 783 return mPreviousNonHyphenCharacter; 784 } 785 void NotifyNonHyphenCharacter(uint32_t ch) { 786 mPreviousNonHyphenCharacter = ch; 787 } 788 789 private: 790 void Init() { 791 mIndex = 0; 792 mLastBreakIndex = 0; 793 mPreviousNonHyphenCharacter = U_NULL; 794 mHasCJKChar = false; 795 mHasNonbreakableSpace = false; 796 mHasPreviousEqualsSign = false; 797 mHasPreviousSlash = false; 798 mHasPreviousBackslash = false; 799 800 if (mText) { 801 // 8-bit text: we only need to check for 802 for (uint32_t i = 0; i < mLength; ++i) { 803 if (IS_NONBREAKABLE_SPACE(mText[i])) { 804 mHasNonbreakableSpace = true; 805 break; 806 } 807 } 808 } else { 809 // 16-bit text: handle surrogates and check for CJK as well as 810 for (uint32_t i = 0; i < mLength; ++i) { 811 char32_t u = GetUnicodeCharAt(i); 812 if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) { 813 mHasNonbreakableSpace = true; 814 if (mHasCJKChar) { 815 break; 816 } 817 } else if (!mHasCJKChar && IS_CJK_CHAR(u)) { 818 mHasCJKChar = true; 819 if (mHasNonbreakableSpace) { 820 break; 821 } 822 } 823 if (u > 0xFFFFu) { 824 ++i; // step over trailing low surrogate 825 } 826 } 827 } 828 } 829 830 const char16_t* const mUniText; 831 const uint8_t* const mText; 832 833 uint32_t mIndex; 834 const uint32_t mLength; // length of text 835 uint32_t mLastBreakIndex; 836 char32_t mPreviousNonHyphenCharacter; // The last character we have seen 837 // which is not U_HYPHEN 838 bool mHasCJKChar; // if the text has CJK character, this is true. 839 bool mHasNonbreakableSpace; // if the text has no-breakable space, 840 // this is true. 841 bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL 842 bool mHasPreviousSlash; // True if we have seen a U_SLASH 843 bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH 844 }; 845 846 static int8_t ContextualAnalysis(char32_t prev, char32_t cur, char32_t next, 847 ContextState& aState, LineBreakRule aLevel, 848 bool aIsChineseOrJapanese) { 849 // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE. 850 851 if (IS_HYPHEN(cur)) { 852 // If next character is hyphen, we don't need to break between them. 853 if (IS_HYPHEN(next)) return CLASS_CHARACTER; 854 // If prev and next characters are numeric, it may be in Math context. 855 // So, we should not break here. 856 bool prevIsNum = IS_ASCII_DIGIT(prev); 857 bool nextIsNum = IS_ASCII_DIGIT(next); 858 if (prevIsNum && nextIsNum) return CLASS_NUMERIC; 859 // If one side is numeric and the other is a character, or if both sides are 860 // characters, the hyphen should be breakable. 861 if (!aState.UseConservativeBreaking(1)) { 862 char32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter(); 863 if (prevOfHyphen && next) { 864 int8_t prevClass = GetClass(prevOfHyphen, aLevel, aIsChineseOrJapanese); 865 int8_t nextClass = GetClass(next, aLevel, aIsChineseOrJapanese); 866 bool prevIsNumOrCharOrClose = 867 prevIsNum || 868 (prevClass == CLASS_CHARACTER && 869 !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) || 870 prevClass == CLASS_CLOSE || prevClass == CLASS_CLOSE_LIKE_CHARACTER; 871 bool nextIsNumOrCharOrOpen = 872 nextIsNum || 873 (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) || 874 nextClass == CLASS_OPEN || nextClass == CLASS_OPEN_LIKE_CHARACTER || 875 next == U_OPEN_SINGLE_QUOTE || next == U_OPEN_DOUBLE_QUOTE || 876 next == U_OPEN_GUILLEMET; 877 if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) { 878 return CLASS_CLOSE; 879 } 880 } 881 } 882 } else { 883 aState.NotifyNonHyphenCharacter(cur); 884 if (cur == U_SLASH || cur == U_BACKSLASH) { 885 // If this is immediately after same char, we should not break here. 886 if (prev == cur) return CLASS_CHARACTER; 887 // If this text has two or more (BACK)SLASHs, this may be file path or 888 // URL. Make sure to compute shouldReturn before we notify on this slash. 889 bool shouldReturn = !aState.UseConservativeBreaking() && 890 (cur == U_SLASH ? aState.HasPreviousSlash() 891 : aState.HasPreviousBackslash()); 892 893 if (cur == U_SLASH) { 894 aState.NotifySeenSlash(); 895 } else { 896 aState.NotifySeenBackslash(); 897 } 898 899 if (shouldReturn) return CLASS_OPEN; 900 } else if (cur == U_PERCENT) { 901 // If this is a part of the param of URL, we should break before. 902 if (!aState.UseConservativeBreaking()) { 903 if (aState.Index() >= 3 && 904 aState.GetCodeUnitAt(aState.Index() - 3) == U_PERCENT) 905 return CLASS_OPEN; 906 if (aState.Index() + 3 < aState.Length() && 907 aState.GetCodeUnitAt(aState.Index() + 3) == U_PERCENT) 908 return CLASS_OPEN; 909 } 910 } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) { 911 // If this may be a separator of params of URL, we should break after. 912 if (!aState.UseConservativeBreaking(1) && aState.HasPreviousEqualsSign()) 913 return CLASS_CLOSE; 914 } else if (cur == U_OPEN_SINGLE_QUOTE || cur == U_OPEN_DOUBLE_QUOTE || 915 cur == U_OPEN_GUILLEMET) { 916 // for CJK usage, we treat these as openers to allow a break before them, 917 // but otherwise treat them as normal characters because quote mark usage 918 // in various Western languages varies too much; see bug #450088 919 // discussion. 920 if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next)) 921 return CLASS_OPEN; 922 } else { 923 NS_ERROR("Forgot to handle the current character!"); 924 } 925 } 926 return GetClass(cur, aLevel, aIsChineseOrJapanese); 927 } 928 929 int32_t LineBreaker::Next(const char16_t* aText, uint32_t aLen, uint32_t aPos) { 930 MOZ_ASSERT(aText); 931 932 if (aPos >= aLen) { 933 return NS_LINEBREAKER_NEED_MORE_TEXT; 934 } 935 936 bool textNeedsComplexLineBreak = false; 937 int32_t begin, end; 938 939 for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) { 940 if (IS_CJK_CHAR(aText[begin]) || 941 NS_NeedsPlatformNativeHandling(aText[begin])) { 942 textNeedsComplexLineBreak = true; 943 } 944 } 945 for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) { 946 if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) { 947 textNeedsComplexLineBreak = true; 948 } 949 } 950 951 int32_t ret; 952 if (!textNeedsComplexLineBreak) { 953 // No complex text character, do not try to do complex line break. 954 // (This is required for serializers. See Bug #344816.) 955 ret = end; 956 } else { 957 AutoTArray<uint8_t, 2000> breakState; 958 // XXX(Bug 1631371) Check if this should use a fallible operation as it 959 // pretended earlier. 960 breakState.AppendElements(end - begin); 961 ComputeBreakPositions(aText + begin, end - begin, WordBreakRule::Normal, 962 LineBreakRule::Auto, false, breakState.Elements()); 963 964 ret = aPos; 965 do { 966 ++ret; 967 } while (begin < ret && ret < end && !breakState[ret - begin]); 968 } 969 970 return ret; 971 } 972 973 static bool SuppressBreakForKeepAll(uint32_t aPrev, uint32_t aCh) { 974 auto affectedByKeepAll = [](uint8_t aLBClass) { 975 switch (aLBClass) { 976 // Per https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all: 977 // "implicit soft wrap opportunities between typographic letter units 978 // (or other typographic character units belonging to the NU, AL, AI, 979 // or ID Unicode line breaking classes [UAX14]) are suppressed..." 980 case U_LB_ALPHABETIC: 981 case U_LB_AMBIGUOUS: 982 case U_LB_NUMERIC: 983 case U_LB_IDEOGRAPHIC: 984 // Additional classes that should be treated similarly, but have been 985 // broken out as separate classes in newer Unicode versions: 986 case U_LB_H2: 987 case U_LB_H3: 988 case U_LB_JL: 989 case U_LB_JV: 990 case U_LB_JT: 991 case U_LB_CONDITIONAL_JAPANESE_STARTER: 992 return true; 993 default: 994 return false; 995 } 996 }; 997 return affectedByKeepAll(GetLineBreakClass(aPrev)) && 998 affectedByKeepAll(GetLineBreakClass(aCh)); 999 } 1000 1001 static LineBreakStrictness ConvertLineBreakRuleToICU4X(LineBreakRule aLevel) { 1002 switch (aLevel) { 1003 case LineBreakRule::Auto: 1004 return LineBreakStrictness::Strict; 1005 case LineBreakRule::Strict: 1006 return LineBreakStrictness::Strict; 1007 case LineBreakRule::Loose: 1008 return LineBreakStrictness::Loose; 1009 case LineBreakRule::Normal: 1010 return LineBreakStrictness::Normal; 1011 case LineBreakRule::Anywhere: 1012 return LineBreakStrictness::Anywhere; 1013 } 1014 MOZ_ASSERT_UNREACHABLE("should have been handled already"); 1015 return LineBreakStrictness::Normal; 1016 } 1017 1018 static LineBreakWordOption ConvertWordBreakRuleToICU4X( 1019 WordBreakRule aWordBreak) { 1020 switch (aWordBreak) { 1021 case WordBreakRule::Normal: 1022 return LineBreakWordOption::Normal; 1023 case WordBreakRule::BreakAll: 1024 return LineBreakWordOption::BreakAll; 1025 case WordBreakRule::KeepAll: 1026 return LineBreakWordOption::KeepAll; 1027 } 1028 MOZ_ASSERT_UNREACHABLE("should have been handled already"); 1029 return LineBreakWordOption::Normal; 1030 } 1031 1032 static capi::LineSegmenter* sLineSegmenter = nullptr; 1033 static capi::Locale* sZhLocale = nullptr; 1034 1035 static capi::LineSegmenter* GetDefaultLineSegmenter() { 1036 static std::once_flag sOnce; 1037 1038 std::call_once(sOnce, [] { 1039 sLineSegmenter = capi::icu4x_LineSegmenter_create_auto_mv1(); 1040 }); 1041 1042 return sLineSegmenter; 1043 } 1044 1045 static bool UseDefaultLineSegmenter(WordBreakRule aWordBreak, 1046 LineBreakRule aLevel, 1047 bool aIsChineseOrJapanese) { 1048 return aWordBreak == WordBreakRule::Normal && 1049 (aLevel == LineBreakRule::Strict || aLevel == LineBreakRule::Auto) && 1050 !aIsChineseOrJapanese; 1051 } 1052 1053 static void InitDefaultLocale() { 1054 static std::once_flag sOnce; 1055 std::call_once(sOnce, [] { 1056 auto locale = capi::icu4x_Locale_from_string_mv1( 1057 diplomat::capi::DiplomatStringView{"zh", 2}); 1058 if (locale.is_ok) { 1059 sZhLocale = locale.ok; 1060 } 1061 }); 1062 } 1063 1064 static capi::LineSegmenter* GetLineSegmenter(bool aUseDefault, 1065 WordBreakRule aWordBreak, 1066 LineBreakRule aLevel, 1067 bool aIsChineseOrJapanese) { 1068 if (aUseDefault) { 1069 MOZ_ASSERT( 1070 UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese)); 1071 return GetDefaultLineSegmenter(); 1072 } 1073 1074 if (!sZhLocale && aIsChineseOrJapanese) { 1075 InitDefaultLocale(); 1076 } 1077 1078 LineBreakOptionsV2 options; 1079 options.word_option = ConvertWordBreakRuleToICU4X(aWordBreak); 1080 options.strictness = ConvertLineBreakRuleToICU4X(aLevel); 1081 auto locale = aIsChineseOrJapanese ? sZhLocale : nullptr; 1082 1083 return capi::icu4x_LineSegmenter_create_lstm_with_options_v2_mv1( 1084 locale, options.AsFFI()); 1085 } 1086 1087 void LineBreaker::ComputeBreakPositions( 1088 const char16_t* aChars, uint32_t aLength, WordBreakRule aWordBreak, 1089 LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) { 1090 if (StaticPrefs::intl_icu4x_segmenter_enabled()) { 1091 if (aLength == 1) { 1092 // Although UAX#14 LB2 rule requires never breaking at the start of text 1093 // (SOT), ICU4X line segmenter API is designed to match other segmenter in 1094 // UAX#29 to always break at the start of text. Hence the optimization 1095 // here to avoid calling into ICU4X line segmenter. 1096 aBreakBefore[0] = 1; 1097 return; 1098 } 1099 1100 // We only cache line-breaks if we think the text is likely to hit the slow 1101 // (LSTM) codepath in icu_segmenter. To avoid scanning the entire text just 1102 // to make that decision, we probe every /kStride/ characters. 1103 bool useCache = [=]() { 1104 const uint32_t kStride = 8; 1105 for (uint32_t i = 0; i < aLength; i += kStride) { 1106 if (intl::UnicodeProperties::IsScriptioContinua(aChars[i])) { 1107 return true; 1108 } 1109 } 1110 return false; 1111 }(); 1112 Maybe<LineBreakCache::Entry> entry; 1113 if (useCache) { 1114 LineBreakCache::KeyType key{aChars, aLength, aWordBreak, aLevel, 1115 aIsChineseOrJapanese}; 1116 entry.emplace(LineBreakCache::Cache()->Lookup(key)); 1117 if (*entry) { 1118 auto& breakBefore = entry->Data().mBreaks; 1119 LineBreakCache::CopyAndFill(breakBefore, aBreakBefore, 1120 aBreakBefore + aLength); 1121 return; 1122 } 1123 } 1124 1125 memset(aBreakBefore, 0, aLength); 1126 1127 CheckedInt<int32_t> length = aLength; 1128 if (length.isValid()) { 1129 const bool useDefault = 1130 UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese); 1131 auto lineSegmenter = GetLineSegmenter(useDefault, aWordBreak, aLevel, 1132 aIsChineseOrJapanese); 1133 auto segmenter = LineSegmenter::FromFFI(lineSegmenter); 1134 auto iterator = 1135 segmenter->segment16(std::u16string_view{aChars, aLength}); 1136 1137 while (true) { 1138 const int32_t nextPos = iterator->next(); 1139 if (nextPos < 0 || nextPos >= length.value()) { 1140 break; 1141 } 1142 aBreakBefore[nextPos] = 1; 1143 } 1144 1145 if (!useDefault) { 1146 capi::icu4x_LineSegmenter_destroy_mv1(lineSegmenter); 1147 } 1148 } 1149 1150 if (useCache) { 1151 // As a very simple memory saving measure we trim off trailing elements 1152 // that are false before caching. 1153 auto* afterLastTrue = aBreakBefore + aLength; 1154 while (!*(afterLastTrue - 1)) { 1155 if (--afterLastTrue == aBreakBefore) { 1156 break; 1157 } 1158 } 1159 1160 entry->Set(LineBreakCache::EntryType{ 1161 nsString(aChars, aLength), 1162 nsTArray<uint8_t>(aBreakBefore, afterLastTrue - aBreakBefore), 1163 aWordBreak, aLevel, aIsChineseOrJapanese}); 1164 } 1165 1166 return; 1167 } 1168 1169 uint32_t cur; 1170 int8_t lastClass = CLASS_NONE; 1171 ContextState state(aChars, aLength); 1172 1173 for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { 1174 char32_t ch = state.GetUnicodeCharAt(cur); 1175 uint32_t chLen = ch > 0xFFFFu ? 2 : 1; 1176 int8_t cl; 1177 1178 auto prev = [=]() -> char32_t { 1179 if (!cur) { 1180 return 0; 1181 } 1182 char32_t c = aChars[cur - 1]; 1183 if (cur > 1 && NS_IS_SURROGATE_PAIR(aChars[cur - 2], c)) { 1184 c = SURROGATE_TO_UCS4(aChars[cur - 2], c); 1185 } 1186 return c; 1187 }; 1188 1189 if (NEED_CONTEXTUAL_ANALYSIS(ch)) { 1190 char32_t next; 1191 if (cur + chLen < aLength) { 1192 next = state.GetUnicodeCharAt(cur + chLen); 1193 } else { 1194 next = 0; 1195 } 1196 cl = ContextualAnalysis(prev(), ch, next, state, aLevel, 1197 aIsChineseOrJapanese); 1198 } else { 1199 if (ch == U_EQUAL) state.NotifySeenEqualsSign(); 1200 state.NotifyNonHyphenCharacter(ch); 1201 cl = GetClass(ch, aLevel, aIsChineseOrJapanese); 1202 } 1203 1204 // To implement word-break:break-all, we overwrite the line-break class of 1205 // alphanumeric characters so they are treated the same as ideographic. 1206 // The relevant characters will have been assigned CLASS_CHARACTER, _CLOSE, 1207 // _CLOSE_LIKE_CHARACTER, or _NUMERIC by GetClass(), but those classes also 1208 // include others that we don't want to touch here, so we re-check the 1209 // Unicode line-break class to determine which ones to modify. 1210 if (aWordBreak == WordBreakRule::BreakAll && 1211 (cl == CLASS_CHARACTER || cl == CLASS_CLOSE || 1212 cl == CLASS_CLOSE_LIKE_CHARACTER || cl == CLASS_NUMERIC)) { 1213 auto cls = GetLineBreakClass(ch); 1214 if (cls == U_LB_ALPHABETIC || cls == U_LB_NUMERIC || 1215 cls == U_LB_AMBIGUOUS || cls == U_LB_COMPLEX_CONTEXT || 1216 /* Additional Japanese and Korean LB classes; CSS Text spec doesn't 1217 explicitly mention these, but this appears to give expected 1218 behavior (spec issue?) */ 1219 cls == U_LB_CONDITIONAL_JAPANESE_STARTER || 1220 (cls >= U_LB_H2 && cls <= U_LB_JV)) { 1221 cl = CLASS_BREAKABLE; 1222 } 1223 } 1224 1225 bool allowBreak = false; 1226 if (cur > 0) { 1227 NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl, 1228 "Loop should have prevented adjacent complex chars here"); 1229 allowBreak = 1230 (state.UseConservativeBreaking() ? GetPairConservative(lastClass, cl) 1231 : GetPair(lastClass, cl)); 1232 // Special cases where a normally-allowed break is suppressed: 1233 if (allowBreak) { 1234 // word-break:keep-all suppresses breaks between certain line-break 1235 // classes. 1236 if (aWordBreak == WordBreakRule::KeepAll && 1237 SuppressBreakForKeepAll(prev(), ch)) { 1238 allowBreak = false; 1239 } 1240 // We also don't allow a break within a run of U+3000 chars unless 1241 // word-break:break-all is in effect. 1242 if (ch == 0x3000 && prev() == 0x3000 && 1243 aWordBreak != WordBreakRule::BreakAll) { 1244 allowBreak = false; 1245 } 1246 } 1247 } 1248 aBreakBefore[cur] = allowBreak; 1249 if (allowBreak) state.NotifyBreakBefore(); 1250 lastClass = cl; 1251 if (CLASS_COMPLEX == cl) { 1252 uint32_t end = cur + chLen; 1253 1254 while (end < aLength) { 1255 char32_t c = state.GetUnicodeCharAt(end); 1256 if (CLASS_COMPLEX != GetClass(c, aLevel, false)) { 1257 break; 1258 } 1259 ++end; 1260 if (c > 0xFFFFU) { // it was a surrogate pair 1261 ++end; 1262 } 1263 } 1264 1265 if (aWordBreak == WordBreakRule::BreakAll) { 1266 // For break-all, we don't need to run a dictionary-based breaking 1267 // algorithm, we just allow breaks between all grapheme clusters. 1268 GraphemeClusterBreakIteratorUtf16 ci( 1269 Span<const char16_t>(aChars + cur, end - cur)); 1270 while (Maybe<uint32_t> pos = ci.Next()) { 1271 aBreakBefore[cur + *pos] = true; 1272 } 1273 } else { 1274 ComplexBreaker::GetBreaks(aChars + cur, end - cur, aBreakBefore + cur); 1275 // restore breakability at chunk begin, which was always set to false 1276 // by the complex line breaker 1277 aBreakBefore[cur] = allowBreak; 1278 } 1279 1280 cur = end - 1; 1281 } 1282 1283 if (chLen == 2) { 1284 // Supplementary-plane character: mark that we cannot break before the 1285 // trailing low surrogate, and advance past it. 1286 ++cur; 1287 aBreakBefore[cur] = false; 1288 state.AdvanceIndex(); 1289 } 1290 } 1291 } 1292 1293 void LineBreaker::ComputeBreakPositions(const uint8_t* aChars, uint32_t aLength, 1294 WordBreakRule aWordBreak, 1295 LineBreakRule aLevel, 1296 bool aIsChineseOrJapanese, 1297 uint8_t* aBreakBefore) { 1298 if (StaticPrefs::intl_icu4x_segmenter_enabled()) { 1299 if (aLength == 1) { 1300 // Although UAX#14 LB2 rule requires never breaking at the start of text 1301 // (SOT), ICU4X line segmenter API is designed to match other segmenter in 1302 // UAX#29 to always break at the start of text. Hence the optimization 1303 // here to avoid calling into ICU4X line segmenter. 1304 aBreakBefore[0] = 1; 1305 return; 1306 } 1307 1308 memset(aBreakBefore, 0, aLength); 1309 1310 CheckedInt<int32_t> length = aLength; 1311 if (!length.isValid()) { 1312 return; 1313 } 1314 1315 const bool useDefault = 1316 UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese); 1317 auto lineSegmenter = 1318 GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese); 1319 auto segmenter = icu4x::LineSegmenter::FromFFI(lineSegmenter); 1320 auto iterator = segmenter->segment_latin1( 1321 diplomat::span<const uint8_t>{aChars, aLength}); 1322 1323 while (true) { 1324 const int32_t nextPos = iterator->next(); 1325 if (nextPos < 0 || nextPos >= length.value()) { 1326 break; 1327 } 1328 aBreakBefore[nextPos] = 1; 1329 } 1330 1331 if (!useDefault) { 1332 capi::icu4x_LineSegmenter_destroy_mv1(lineSegmenter); 1333 } 1334 return; 1335 } 1336 1337 uint32_t cur; 1338 int8_t lastClass = CLASS_NONE; 1339 ContextState state(aChars, aLength); 1340 1341 for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { 1342 char32_t ch = aChars[cur]; 1343 int8_t cl; 1344 1345 if (NEED_CONTEXTUAL_ANALYSIS(ch)) { 1346 cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, ch, 1347 cur + 1 < aLength ? aChars[cur + 1] : U_NULL, 1348 state, aLevel, aIsChineseOrJapanese); 1349 } else { 1350 if (ch == U_EQUAL) state.NotifySeenEqualsSign(); 1351 state.NotifyNonHyphenCharacter(ch); 1352 cl = GetClass(ch, aLevel, aIsChineseOrJapanese); 1353 } 1354 if (aWordBreak == WordBreakRule::BreakAll && 1355 (cl == CLASS_CHARACTER || cl == CLASS_CLOSE || 1356 cl == CLASS_CLOSE_LIKE_CHARACTER || cl == CLASS_NUMERIC)) { 1357 auto cls = GetLineBreakClass(ch); 1358 // Don't need to check additional Japanese/Korean classes in 8-bit 1359 if (cls == U_LB_ALPHABETIC || cls == U_LB_NUMERIC || 1360 cls == U_LB_COMPLEX_CONTEXT) { 1361 cl = CLASS_BREAKABLE; 1362 } 1363 } 1364 1365 bool allowBreak = false; 1366 if (cur > 0) { 1367 allowBreak = 1368 (state.UseConservativeBreaking() ? GetPairConservative(lastClass, cl) 1369 : GetPair(lastClass, cl)) && 1370 (aWordBreak != WordBreakRule::KeepAll || 1371 !SuppressBreakForKeepAll(aChars[cur - 1], ch)); 1372 } 1373 aBreakBefore[cur] = allowBreak; 1374 if (allowBreak) state.NotifyBreakBefore(); 1375 lastClass = cl; 1376 } 1377 } 1378 1379 void LineBreaker::Shutdown() { 1380 if (sLineSegmenter) { 1381 capi::icu4x_LineSegmenter_destroy_mv1(sLineSegmenter); 1382 } 1383 if (sZhLocale) { 1384 capi::icu4x_Locale_destroy_mv1(sZhLocale); 1385 } 1386 1387 sLineSegmenter = nullptr; 1388 sZhLocale = nullptr; 1389 }