dictbe.cpp (63993B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 2006-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 #include <utility> 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_BREAK_ITERATION 15 16 #include "brkeng.h" 17 #include "dictbe.h" 18 #include "unicode/uniset.h" 19 #include "unicode/chariter.h" 20 #include "unicode/resbund.h" 21 #include "unicode/ubrk.h" 22 #include "unicode/usetiter.h" 23 #include "ubrkimpl.h" 24 #include "utracimp.h" 25 #include "uvectr32.h" 26 #include "uvector.h" 27 #include "uassert.h" 28 #include "unicode/normlzr.h" 29 #include "cmemory.h" 30 #include "dictionarydata.h" 31 32 U_NAMESPACE_BEGIN 33 34 /* 35 ****************************************************************** 36 */ 37 38 DictionaryBreakEngine::DictionaryBreakEngine() { 39 } 40 41 DictionaryBreakEngine::~DictionaryBreakEngine() { 42 } 43 44 UBool 45 DictionaryBreakEngine::handles(UChar32 c, const char*) const { 46 return fSet.contains(c); 47 } 48 49 int32_t 50 DictionaryBreakEngine::findBreaks( UText *text, 51 int32_t startPos, 52 int32_t endPos, 53 UVector32 &foundBreaks, 54 UBool isPhraseBreaking, 55 UErrorCode& status) const { 56 if (U_FAILURE(status)) return 0; 57 int32_t result = 0; 58 59 // Find the span of characters included in the set. 60 // The span to break begins at the current position in the text, and 61 // extends towards the start or end of the text, depending on 'reverse'. 
62 63 utext_setNativeIndex(text, startPos); 64 int32_t start = static_cast<int32_t>(utext_getNativeIndex(text)); 65 int32_t current; 66 int32_t rangeStart; 67 int32_t rangeEnd; 68 UChar32 c = utext_current32(text); 69 while ((current = static_cast<int32_t>(utext_getNativeIndex(text))) < endPos && fSet.contains(c)) { 70 utext_next32(text); // TODO: recast loop for postincrement 71 c = utext_current32(text); 72 } 73 rangeStart = start; 74 rangeEnd = current; 75 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status); 76 utext_setNativeIndex(text, current); 77 78 return result; 79 } 80 81 void 82 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { 83 fSet = set; 84 // Compact for caching 85 fSet.compact(); 86 } 87 88 /* 89 ****************************************************************** 90 * PossibleWord 91 */ 92 93 // Helper class for improving readability of the Thai/Lao/Khmer word break 94 // algorithm. The implementation is completely inline. 95 96 // List size, limited by the maximum number of words in the dictionary 97 // that form a nested sequence. 98 static const int32_t POSSIBLE_WORD_LIST_MAX = 20; 99 100 class PossibleWord { 101 private: 102 // list of word candidate lengths, in increasing length order 103 // TODO: bytes would be sufficient for word lengths. 104 int32_t count; // Count of candidates 105 int32_t prefix; // The longest match with a dictionary word 106 int32_t offset; // Offset in the text of these candidates 107 int32_t mark; // The preferred candidate's offset 108 int32_t current; // The candidate we're currently looking at 109 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units. 110 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points. 
111 112 public: 113 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {} 114 ~PossibleWord() {} 115 116 // Fill the list of candidates if needed, select the longest, and return the number found 117 int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); 118 119 // Select the currently marked candidate, point after it in the text, and invalidate self 120 int32_t acceptMarked( UText *text ); 121 122 // Back up from the current candidate to the next shorter one; return true if that exists 123 // and point the text after it 124 UBool backUp( UText *text ); 125 126 // Return the longest prefix this candidate location shares with a dictionary word 127 // Return value is in code points. 128 int32_t longestPrefix() { return prefix; } 129 130 // Mark the current candidate as the one we like 131 void markCurrent() { mark = current; } 132 133 // Get length in code points of the marked word. 134 int32_t markedCPLength() { return cpLengths[mark]; } 135 }; 136 137 138 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { 139 // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 140 int32_t start = static_cast<int32_t>(utext_getNativeIndex(text)); 141 if (start != offset) { 142 offset = start; 143 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix); 144 // Dictionary leaves text after longest prefix, not longest word. Back up. 
145 if (count <= 0) { 146 utext_setNativeIndex(text, start); 147 } 148 } 149 if (count > 0) { 150 utext_setNativeIndex(text, start+cuLengths[count-1]); 151 } 152 current = count-1; 153 mark = current; 154 return count; 155 } 156 157 int32_t 158 PossibleWord::acceptMarked( UText *text ) { 159 utext_setNativeIndex(text, offset + cuLengths[mark]); 160 return cuLengths[mark]; 161 } 162 163 164 UBool 165 PossibleWord::backUp( UText *text ) { 166 if (current > 0) { 167 utext_setNativeIndex(text, offset + cuLengths[--current]); 168 return true; 169 } 170 return false; 171 } 172 173 /* 174 ****************************************************************** 175 * ThaiBreakEngine 176 */ 177 178 // How many words in a row are "good enough"? 179 static const int32_t THAI_LOOKAHEAD = 3; 180 181 // Will not combine a non-word with a preceding dictionary word longer than this 182 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3; 183 184 // Will not combine a non-word that shares at least this much prefix with a 185 // dictionary word, with a preceding word 186 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3; 187 188 // Elision character 189 static const int32_t THAI_PAIYANNOI = 0x0E2F; 190 191 // Repeat character 192 static const int32_t THAI_MAIYAMOK = 0x0E46; 193 194 // Minimum word size 195 static const int32_t THAI_MIN_WORD = 2; 196 197 // Minimum number of characters for two words 198 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2; 199 200 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 201 : DictionaryBreakEngine(), 202 fDictionary(adoptDictionary) 203 { 204 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); 205 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai"); 206 UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status); 207 if (U_SUCCESS(status)) { 208 setCharacters(thaiWordSet); 209 } 210 fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); 211 
fMarkSet.add(0x0020); 212 fEndWordSet = thaiWordSet; 213 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 214 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 215 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 216 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 217 fSuffixSet.add(THAI_PAIYANNOI); 218 fSuffixSet.add(THAI_MAIYAMOK); 219 220 // Compact for caching. 221 fMarkSet.compact(); 222 fEndWordSet.compact(); 223 fBeginWordSet.compact(); 224 fSuffixSet.compact(); 225 UTRACE_EXIT_STATUS(status); 226 } 227 228 ThaiBreakEngine::~ThaiBreakEngine() { 229 delete fDictionary; 230 } 231 232 int32_t 233 ThaiBreakEngine::divideUpDictionaryRange( UText *text, 234 int32_t rangeStart, 235 int32_t rangeEnd, 236 UVector32 &foundBreaks, 237 UBool /* isPhraseBreaking */, 238 UErrorCode& status) const { 239 if (U_FAILURE(status)) return 0; 240 utext_setNativeIndex(text, rangeStart); 241 utext_moveIndex32(text, THAI_MIN_WORD_SPAN); 242 if (utext_getNativeIndex(text) >= rangeEnd) { 243 return 0; // Not enough characters for two words 244 } 245 utext_setNativeIndex(text, rangeStart); 246 247 248 uint32_t wordsFound = 0; 249 int32_t cpWordLength = 0; // Word Length in Code Points. 
250 int32_t cuWordLength = 0; // Word length in code units (UText native indexing) 251 int32_t current; 252 PossibleWord words[THAI_LOOKAHEAD]; 253 254 utext_setNativeIndex(text, rangeStart); 255 256 while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) { 257 cpWordLength = 0; 258 cuWordLength = 0; 259 260 // Look for candidate words at the current position 261 int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 262 263 // If we found exactly one, use that 264 if (candidates == 1) { 265 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 266 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength(); 267 wordsFound += 1; 268 } 269 // If there was more than one, see which one can take us forward the most words 270 else if (candidates > 1) { 271 // If we're already at the end of the range, we're done 272 if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) { 273 goto foundBest; 274 } 275 do { 276 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 277 // Followed by another dictionary word; mark first word as a good candidate 278 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 279 280 // If we're already at the end of the range, we're done 281 if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) { 282 goto foundBest; 283 } 284 285 // See if any of the possible second words is followed by a third word 286 do { 287 // If we find a third word, stop right away 288 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 289 words[wordsFound % THAI_LOOKAHEAD].markCurrent(); 290 goto foundBest; 291 } 292 } 293 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text)); 294 } 295 } 296 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text)); 297 foundBest: 298 // Set UText position to after the accepted word. 
299 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 300 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength(); 301 wordsFound += 1; 302 } 303 304 // We come here after having either found a word or not. We look ahead to the 305 // next word. If it's not a dictionary word, we will combine it with the word we 306 // just found (if there is one), but only if the preceding word does not exceed 307 // the threshold. 308 // The text iterator should now be positioned at the end of the word we found. 309 310 UChar32 uc = 0; 311 if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) { 312 // if it is a dictionary word, do nothing. If it isn't, then if there is 313 // no preceding word, or the non-word shares less than the minimum threshold 314 // of characters with a dictionary word, then scan to resynchronize 315 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 316 && (cuWordLength == 0 317 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { 318 // Look for a plausible word boundary 319 int32_t remaining = rangeEnd - (current+cuWordLength); 320 UChar32 pc; 321 int32_t chars = 0; 322 for (;;) { 323 int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text)); 324 pc = utext_next32(text); 325 int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex; 326 chars += pcSize; 327 remaining -= pcSize; 328 if (remaining <= 0) { 329 break; 330 } 331 uc = utext_current32(text); 332 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 333 // Maybe. See if it's in the dictionary. 334 // NOTE: In the original Apple code, checked that the next 335 // two characters after uc were not 0x0E4C THANTHAKHAT before 336 // checking the dictionary. That is just a performance filter, 337 // but it's not clear it's faster than checking the trie. 
338 int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 339 utext_setNativeIndex(text, current + cuWordLength + chars); 340 if (num_candidates > 0) { 341 break; 342 } 343 } 344 } 345 346 // Bump the word count if there wasn't already one 347 if (cuWordLength <= 0) { 348 wordsFound += 1; 349 } 350 351 // Update the length with the passed-over characters 352 cuWordLength += chars; 353 } 354 else { 355 // Back up to where we were for next iteration 356 utext_setNativeIndex(text, current+cuWordLength); 357 } 358 } 359 360 // Never stop before a combining mark. 361 int32_t currPos; 362 while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 363 utext_next32(text); 364 cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos; 365 } 366 367 // Look ahead for possible suffixes if a dictionary word does not follow. 368 // We do this in code rather than using a rule so that the heuristic 369 // resynch continues to function. For example, one of the suffix characters 370 // could be a typo in the middle of a word. 
371 if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cuWordLength > 0) { 372 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 373 && fSuffixSet.contains(uc = utext_current32(text))) { 374 if (uc == THAI_PAIYANNOI) { 375 if (!fSuffixSet.contains(utext_previous32(text))) { 376 // Skip over previous end and PAIYANNOI 377 utext_next32(text); 378 int32_t paiyannoiIndex = static_cast<int32_t>(utext_getNativeIndex(text)); 379 utext_next32(text); 380 cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - paiyannoiIndex; // Add PAIYANNOI to word 381 uc = utext_current32(text); // Fetch next character 382 } 383 else { 384 // Restore prior position 385 utext_next32(text); 386 } 387 } 388 if (uc == THAI_MAIYAMOK) { 389 if (utext_previous32(text) != THAI_MAIYAMOK) { 390 // Skip over previous end and MAIYAMOK 391 utext_next32(text); 392 int32_t maiyamokIndex = static_cast<int32_t>(utext_getNativeIndex(text)); 393 utext_next32(text); 394 cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - maiyamokIndex; // Add MAIYAMOK to word 395 } 396 else { 397 // Restore prior position 398 utext_next32(text); 399 } 400 } 401 } 402 else { 403 utext_setNativeIndex(text, current+cuWordLength); 404 } 405 } 406 407 // Did we find a word on this iteration? If so, push it on the break stack 408 if (cuWordLength > 0) { 409 foundBreaks.push((current+cuWordLength), status); 410 } 411 } 412 413 // Don't return a break for the end of the dictionary range if there is one there. 414 if (foundBreaks.peeki() >= rangeEnd) { 415 (void) foundBreaks.popi(); 416 wordsFound -= 1; 417 } 418 419 return wordsFound; 420 } 421 422 /* 423 ****************************************************************** 424 * LaoBreakEngine 425 */ 426 427 // How many words in a row are "good enough"? 
428 static const int32_t LAO_LOOKAHEAD = 3; 429 430 // Will not combine a non-word with a preceding dictionary word longer than this 431 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3; 432 433 // Will not combine a non-word that shares at least this much prefix with a 434 // dictionary word, with a preceding word 435 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3; 436 437 // Minimum word size 438 static const int32_t LAO_MIN_WORD = 2; 439 440 // Minimum number of characters for two words 441 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2; 442 443 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 444 : DictionaryBreakEngine(), 445 fDictionary(adoptDictionary) 446 { 447 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); 448 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo"); 449 UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status); 450 if (U_SUCCESS(status)) { 451 setCharacters(laoWordSet); 452 } 453 fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); 454 fMarkSet.add(0x0020); 455 fEndWordSet = laoWordSet; 456 fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels 457 fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) 458 fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) 459 fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels 460 461 // Compact for caching. 
462 fMarkSet.compact(); 463 fEndWordSet.compact(); 464 fBeginWordSet.compact(); 465 UTRACE_EXIT_STATUS(status); 466 } 467 468 LaoBreakEngine::~LaoBreakEngine() { 469 delete fDictionary; 470 } 471 472 int32_t 473 LaoBreakEngine::divideUpDictionaryRange( UText *text, 474 int32_t rangeStart, 475 int32_t rangeEnd, 476 UVector32 &foundBreaks, 477 UBool /* isPhraseBreaking */, 478 UErrorCode& status) const { 479 if (U_FAILURE(status)) return 0; 480 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { 481 return 0; // Not enough characters for two words 482 } 483 484 uint32_t wordsFound = 0; 485 int32_t cpWordLength = 0; 486 int32_t cuWordLength = 0; 487 int32_t current; 488 PossibleWord words[LAO_LOOKAHEAD]; 489 490 utext_setNativeIndex(text, rangeStart); 491 492 while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) { 493 cuWordLength = 0; 494 cpWordLength = 0; 495 496 // Look for candidate words at the current position 497 int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 498 499 // If we found exactly one, use that 500 if (candidates == 1) { 501 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 502 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength(); 503 wordsFound += 1; 504 } 505 // If there was more than one, see which one can take us forward the most words 506 else if (candidates > 1) { 507 // If we're already at the end of the range, we're done 508 if (utext_getNativeIndex(text) >= rangeEnd) { 509 goto foundBest; 510 } 511 do { 512 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 513 // Followed by another dictionary word; mark first word as a good candidate 514 words[wordsFound%LAO_LOOKAHEAD].markCurrent(); 515 516 // If we're already at the end of the range, we're done 517 if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) { 518 goto foundBest; 519 } 520 521 // See if any of the possible 
second words is followed by a third word 522 do { 523 // If we find a third word, stop right away 524 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 525 words[wordsFound % LAO_LOOKAHEAD].markCurrent(); 526 goto foundBest; 527 } 528 } 529 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text)); 530 } 531 } 532 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text)); 533 foundBest: 534 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 535 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength(); 536 wordsFound += 1; 537 } 538 539 // We come here after having either found a word or not. We look ahead to the 540 // next word. If it's not a dictionary word, we will combine it with the word we 541 // just found (if there is one), but only if the preceding word does not exceed 542 // the threshold. 543 // The text iterator should now be positioned at the end of the word we found. 544 if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) { 545 // if it is a dictionary word, do nothing. 
If it isn't, then if there is 546 // no preceding word, or the non-word shares less than the minimum threshold 547 // of characters with a dictionary word, then scan to resynchronize 548 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 549 && (cuWordLength == 0 550 || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) { 551 // Look for a plausible word boundary 552 int32_t remaining = rangeEnd - (current + cuWordLength); 553 UChar32 pc; 554 UChar32 uc; 555 int32_t chars = 0; 556 for (;;) { 557 int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text)); 558 pc = utext_next32(text); 559 int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex; 560 chars += pcSize; 561 remaining -= pcSize; 562 if (remaining <= 0) { 563 break; 564 } 565 uc = utext_current32(text); 566 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 567 // Maybe. See if it's in the dictionary. 568 // TODO: this looks iffy; compare with old code. 569 int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 570 utext_setNativeIndex(text, current + cuWordLength + chars); 571 if (num_candidates > 0) { 572 break; 573 } 574 } 575 } 576 577 // Bump the word count if there wasn't already one 578 if (cuWordLength <= 0) { 579 wordsFound += 1; 580 } 581 582 // Update the length with the passed-over characters 583 cuWordLength += chars; 584 } 585 else { 586 // Back up to where we were for next iteration 587 utext_setNativeIndex(text, current + cuWordLength); 588 } 589 } 590 591 // Never stop before a combining mark. 592 int32_t currPos; 593 while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 594 utext_next32(text); 595 cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos; 596 } 597 598 // Look ahead for possible suffixes if a dictionary word does not follow. 
599 // We do this in code rather than using a rule so that the heuristic 600 // resynch continues to function. For example, one of the suffix characters 601 // could be a typo in the middle of a word. 602 // NOT CURRENTLY APPLICABLE TO LAO 603 604 // Did we find a word on this iteration? If so, push it on the break stack 605 if (cuWordLength > 0) { 606 foundBreaks.push((current+cuWordLength), status); 607 } 608 } 609 610 // Don't return a break for the end of the dictionary range if there is one there. 611 if (foundBreaks.peeki() >= rangeEnd) { 612 (void) foundBreaks.popi(); 613 wordsFound -= 1; 614 } 615 616 return wordsFound; 617 } 618 619 /* 620 ****************************************************************** 621 * BurmeseBreakEngine 622 */ 623 624 // How many words in a row are "good enough"? 625 static const int32_t BURMESE_LOOKAHEAD = 3; 626 627 // Will not combine a non-word with a preceding dictionary word longer than this 628 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3; 629 630 // Will not combine a non-word that shares at least this much prefix with a 631 // dictionary word, with a preceding word 632 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3; 633 634 // Minimum word size 635 static const int32_t BURMESE_MIN_WORD = 2; 636 637 // Minimum number of characters for two words 638 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2; 639 640 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 641 : DictionaryBreakEngine(), 642 fDictionary(adoptDictionary) 643 { 644 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); 645 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr"); 646 fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels 647 fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status); 648 fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); 649 fMarkSet.add(0x0020); 650 if (U_SUCCESS(status)) { 651 
setCharacters(fEndWordSet); 652 } 653 654 // Compact for caching. 655 fMarkSet.compact(); 656 fEndWordSet.compact(); 657 fBeginWordSet.compact(); 658 UTRACE_EXIT_STATUS(status); 659 } 660 661 BurmeseBreakEngine::~BurmeseBreakEngine() { 662 delete fDictionary; 663 } 664 665 int32_t 666 BurmeseBreakEngine::divideUpDictionaryRange( UText *text, 667 int32_t rangeStart, 668 int32_t rangeEnd, 669 UVector32 &foundBreaks, 670 UBool /* isPhraseBreaking */, 671 UErrorCode& status ) const { 672 if (U_FAILURE(status)) return 0; 673 if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) { 674 return 0; // Not enough characters for two words 675 } 676 677 uint32_t wordsFound = 0; 678 int32_t cpWordLength = 0; 679 int32_t cuWordLength = 0; 680 int32_t current; 681 PossibleWord words[BURMESE_LOOKAHEAD]; 682 683 utext_setNativeIndex(text, rangeStart); 684 685 while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) { 686 cuWordLength = 0; 687 cpWordLength = 0; 688 689 // Look for candidate words at the current position 690 int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 691 692 // If we found exactly one, use that 693 if (candidates == 1) { 694 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text); 695 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength(); 696 wordsFound += 1; 697 } 698 // If there was more than one, see which one can take us forward the most words 699 else if (candidates > 1) { 700 // If we're already at the end of the range, we're done 701 if (utext_getNativeIndex(text) >= rangeEnd) { 702 goto foundBest; 703 } 704 do { 705 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 706 // Followed by another dictionary word; mark first word as a good candidate 707 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent(); 708 709 // If we're already at the end of the range, we're done 710 if 
(static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) { 711 goto foundBest; 712 } 713 714 // See if any of the possible second words is followed by a third word 715 do { 716 // If we find a third word, stop right away 717 if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 718 words[wordsFound % BURMESE_LOOKAHEAD].markCurrent(); 719 goto foundBest; 720 } 721 } 722 while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text)); 723 } 724 } 725 while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text)); 726 foundBest: 727 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text); 728 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength(); 729 wordsFound += 1; 730 } 731 732 // We come here after having either found a word or not. We look ahead to the 733 // next word. If it's not a dictionary word, we will combine it with the word we 734 // just found (if there is one), but only if the preceding word does not exceed 735 // the threshold. 736 // The text iterator should now be positioned at the end of the word we found. 737 if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) { 738 // if it is a dictionary word, do nothing. 
If it isn't, then if there is 739 // no preceding word, or the non-word shares less than the minimum threshold 740 // of characters with a dictionary word, then scan to resynchronize 741 if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 742 && (cuWordLength == 0 743 || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) { 744 // Look for a plausible word boundary 745 int32_t remaining = rangeEnd - (current + cuWordLength); 746 UChar32 pc; 747 UChar32 uc; 748 int32_t chars = 0; 749 for (;;) { 750 int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text)); 751 pc = utext_next32(text); 752 int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex; 753 chars += pcSize; 754 remaining -= pcSize; 755 if (remaining <= 0) { 756 break; 757 } 758 uc = utext_current32(text); 759 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 760 // Maybe. See if it's in the dictionary. 761 // TODO: this looks iffy; compare with old code. 762 int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 763 utext_setNativeIndex(text, current + cuWordLength + chars); 764 if (num_candidates > 0) { 765 break; 766 } 767 } 768 } 769 770 // Bump the word count if there wasn't already one 771 if (cuWordLength <= 0) { 772 wordsFound += 1; 773 } 774 775 // Update the length with the passed-over characters 776 cuWordLength += chars; 777 } 778 else { 779 // Back up to where we were for next iteration 780 utext_setNativeIndex(text, current + cuWordLength); 781 } 782 } 783 784 // Never stop before a combining mark. 
785 int32_t currPos; 786 while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 787 utext_next32(text); 788 cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos; 789 } 790 791 // Look ahead for possible suffixes if a dictionary word does not follow. 792 // We do this in code rather than using a rule so that the heuristic 793 // resynch continues to function. For example, one of the suffix characters 794 // could be a typo in the middle of a word. 795 // NOT CURRENTLY APPLICABLE TO BURMESE 796 797 // Did we find a word on this iteration? If so, push it on the break stack 798 if (cuWordLength > 0) { 799 foundBreaks.push((current+cuWordLength), status); 800 } 801 } 802 803 // Don't return a break for the end of the dictionary range if there is one there. 804 if (foundBreaks.peeki() >= rangeEnd) { 805 (void) foundBreaks.popi(); 806 wordsFound -= 1; 807 } 808 809 return wordsFound; 810 } 811 812 /* 813 ****************************************************************** 814 * KhmerBreakEngine 815 */ 816 817 // How many words in a row are "good enough"? 
818 static const int32_t KHMER_LOOKAHEAD = 3; 819 820 // Will not combine a non-word with a preceding dictionary word longer than this 821 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3; 822 823 // Will not combine a non-word that shares at least this much prefix with a 824 // dictionary word, with a preceding word 825 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3; 826 827 // Minimum word size 828 static const int32_t KHMER_MIN_WORD = 2; 829 830 // Minimum number of characters for two words 831 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; 832 833 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 834 : DictionaryBreakEngine(), 835 fDictionary(adoptDictionary) 836 { 837 UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); 838 UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr"); 839 UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status); 840 if (U_SUCCESS(status)) { 841 setCharacters(khmerWordSet); 842 } 843 fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); 844 fMarkSet.add(0x0020); 845 fEndWordSet = khmerWordSet; 846 fBeginWordSet.add(0x1780, 0x17B3); 847 //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels 848 //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word 849 //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word 850 fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters 851 //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels 852 // fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 853 // fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 854 // fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 855 // fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 856 // fSuffixSet.add(THAI_PAIYANNOI); 857 // fSuffixSet.add(THAI_MAIYAMOK); 858 859 // Compact for caching. 
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();
    UTRACE_EXIT_STATUS(status);
}

// Destructor: the engine owns the dictionary it was constructed with
// (adopted in the constructor) and must release it.
KhmerBreakEngine::~KhmerBreakEngine() {
    delete fDictionary;
}

/**
 * Divide up a range of Khmer text using dictionary-based word lookahead.
 *
 * Greedy best-path heuristic: at each position, gather candidate dictionary
 * words; when more than one candidate exists, prefer the one that lets the
 * most subsequent words (up to two more) also match, using the ring buffer
 * of PossibleWord state (indexed modulo KHMER_LOOKAHEAD). Non-dictionary
 * runs are skipped by scanning for a plausible end-word/begin-word
 * character pair that is followed by a dictionary match.
 *
 * @param text        UText positioned over the text to be broken.
 * @param rangeStart  Native index of the first character of the range.
 * @param rangeEnd    Native index just past the last character of the range.
 * @param foundBreaks Receives the break positions, in ascending order.
 * @param status      ICU error code; no work is done on entry failure.
 * @return The number of word breaks found.
 */
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UVector32 &foundBreaks,
                                                UBool /* isPhraseBreaking */,
                                                UErrorCode& status ) const {
    if (U_FAILURE(status)) return 0;
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordsFound = 0;
    int32_t cpWordLength = 0;   // length of the current word in code points
    int32_t cuWordLength = 0;   // length of the current word in code units (native index units)
    int32_t current;
    // Ring buffer of candidate-word state; index is (word ordinal) % KHMER_LOOKAHEAD.
    PossibleWord words[KHMER_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status) && (current = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd) {
        cuWordLength = 0;
        cpWordLength = 0;

        // Look for candidate words at the current position
        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1) {
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {
                goto foundBest;
            }
            do {
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    // Followed by another dictionary word; mark first word as a good candidate
                    words[wordsFound % KHMER_LOOKAHEAD].markCurrent();

                    // If we're already at the end of the range, we're done
                    if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            // Accept the marked (preferred) first-word candidate.
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // if it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current + cuWordLength);
                UChar32 pc;   // previous character (just consumed)
                UChar32 uc;   // upcoming character (not yet consumed)
                int32_t chars = 0;   // code units passed over during the scan
                for (;;) {
                    int32_t pcIndex = static_cast<int32_t>(utext_getNativeIndex(text));
                    pc = utext_next32(text);
                    int32_t pcSize = static_cast<int32_t>(utext_getNativeIndex(text)) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    // A word may plausibly end at a transition from an end-word
                    // character to a begin-word character.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current + cuWordLength + chars);
                        if (num_candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current + cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = static_cast<int32_t>(utext_getNativeIndex(text))) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += static_cast<int32_t>(utext_getNativeIndex(text)) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current + cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    // NOTE(review): foundBreaks may be empty here; this relies on peeki() being
    // benign for an empty vector — confirm against UVector32's contract.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        wordsFound -= 1;
    }

    return wordsFound;
}

#if !UCONFIG_NO_NORMALIZATION
/*
 ******************************************************************
 * CjkBreakEngine
 */
static const uint32_t kuint32max = 0xFFFFFFFF;

// Constructor: adopts the dictionary; selects the character set to handle
// (Hangul for Korean, Han/Hiragana/Katakana for Chinese/Japanese) and sets
// up either the ML-based phrase breaker or the Japanese phrase parameters.
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(), fDictionary(adoptDictionary), isCj(false) {
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
    fMlBreakEngine = nullptr;
    nfkcNorm2 = Normalizer2::getNFKCInstance(status);
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
    fHangulWordSet.compact();
    // Digits, open punctuation and Alphabetic characters.
    fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(
        UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status);
    fDigitOrOpenPunctuationOrAlphabetSet.compact();
    fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status);
    fClosePunctuationSet.compact();

    // handle Korean and Japanese/Chinese using different dictionaries
    if (type == kKorean) {
        if (U_SUCCESS(status)) {
            setCharacters(fHangulWordSet);
        }
    } else { // Chinese and Japanese
        UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
        isCj = true;
        if (U_SUCCESS(status)) {
            setCharacters(cjSet);
#if UCONFIG_USE_ML_PHRASE_BREAKING
            fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet,
                                              fClosePunctuationSet, status);
            if (fMlBreakEngine == nullptr) {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
#else
            initJapanesePhraseParameter(status);
#endif
        }
    }

    UTRACE_EXIT_STATUS(status);
}

// Destructor: releases the adopted dictionary and the optional ML break engine.
CjkBreakEngine::~CjkBreakEngine(){
    delete fDictionary;
    delete fMlBreakEngine;
}

// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int32_t kMaxKatakanaLength = 8;
static const int32_t kMaxKatakanaGroupLength = 20;
static const uint32_t maxSnlp = 255;

// Cost (in SNLP units) of treating a katakana run of the given length as a
// word. Runs longer than kMaxKatakanaLength get the worst (length-0) cost.
static inline uint32_t getKatakanaCost(int32_t wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                           = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
}

// True for katakana code points (standard block minus the middle dot U+30FB,
// plus the halfwidth katakana range).
static inline bool isKatakana(UChar32 value) {
    return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
            (value >= 0xFF66 && value <= 0xFF9f);
}

// Function for accessing internal utext flags.
//   Replicates an internal UText function.

static inline int32_t utext_i32_flag(int32_t bitIndex) {
    return static_cast<int32_t>(1) << bitIndex;
}

/*
 * Divide up a range of CJK text using dynamic-programming segmentation
 * over dictionary match costs (SNLP), with a katakana-run heuristic.
 *
 * @param inText A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks vector<int32> to receive the break positions
 * @param isPhraseBreaking true for phrase breaking (ja/ko)
 * @param status ICU error code
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
        int32_t rangeStart,
        int32_t rangeEnd,
        UVector32 &foundBreaks,
        UBool isPhraseBreaking,
        UErrorCode& status) const {
    if (U_FAILURE(status)) return 0;
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    // UnicodeString version of input UText, NFKC normalized if necessary.
    UnicodeString inString;

    // inputMap[inStringIndex] = corresponding native index from UText inText.
    //    If nullptr then mapping is 1:1
    LocalPointer<UVector32>     inputMap;

    // if UText has the input string as one contiguous UTF-16 chunk
    if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
         inText->chunkNativeStart <= rangeStart &&
         inText->chunkNativeLimit >= rangeEnd   &&
         inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {

        // Input UText is in one contiguous UTF-16 chunk.
        // Use Read-only aliasing UnicodeString.
        inString.setTo(false,
                       inText->chunkContents + rangeStart - inText->chunkNativeStart,
                       rangeEnd - rangeStart);
    } else {
        // Copy the text from the original inText (UText) to inString (UnicodeString).
        // Create a map from UnicodeString indices -> UText offsets.
        utext_setNativeIndex(inText, rangeStart);
        int32_t limit = rangeEnd;
        U_ASSERT(limit <= utext_nativeLength(inText));
        if (limit > utext_nativeLength(inText)) {
            limit = static_cast<int32_t>(utext_nativeLength(inText));
        }
        inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
        if (U_FAILURE(status)) {
            return 0;
        }
        while (utext_getNativeIndex(inText) < limit) {
            int32_t nativePosition = static_cast<int32_t>(utext_getNativeIndex(inText));
            UChar32 c = utext_next32(inText);
            U_ASSERT(c != U_SENTINEL);
            inString.append(c);
            // A supplementary character appends two UTF-16 units; both map back
            // to the same native position.
            while (inputMap->size() < inString.length()) {
                inputMap->addElement(nativePosition, status);
            }
        }
        inputMap->addElement(limit, status);
    }


    if (!nfkcNorm2->isNormalized(inString, status)) {
        UnicodeString normalizedInput;
        //  normalizedMap[normalizedInput position] ==  original UText position.
        LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
        if (U_FAILURE(status)) {
            return 0;
        }

        UnicodeString fragment;
        UnicodeString normalizedFragment;
        for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
            fragment.remove();
            int32_t fragmentStartI = srcI;
            UChar32 c = inString.char32At(srcI);
            // Accumulate one normalization chunk: extend until the next character
            // that has a normalization boundary before it.
            for (;;) {
                fragment.append(c);
                srcI = inString.moveIndex32(srcI, 1);
                if (srcI == inString.length()) {
                    break;
                }
                c = inString.char32At(srcI);
                if (nfkcNorm2->hasBoundaryBefore(c)) {
                    break;
                }
            }
            nfkcNorm2->normalize(fragment, normalizedFragment, status);
            normalizedInput.append(normalizedFragment);

            // Map every position in the normalized chunk to the start of the chunk
            //   in the original input.
            int32_t fragmentOriginalStart = inputMap.isValid() ?
                inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
            while (normalizedMap->size() < normalizedInput.length()) {
                normalizedMap->addElement(fragmentOriginalStart, status);
                if (U_FAILURE(status)) {
                    break;
                }
            }
        }
        U_ASSERT(normalizedMap->size() == normalizedInput.length());
        int32_t nativeEnd = inputMap.isValid() ?
            inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
        normalizedMap->addElement(nativeEnd, status);

        // From here on, work on the normalized string and its map.
        inputMap = std::move(normalizedMap);
        inString = std::move(normalizedInput);
    }

    int32_t numCodePts = inString.countChar32();
    if (numCodePts != inString.length()) {
        // There are supplementary characters in the input.
        // The dictionary will produce boundary positions in terms of code point indexes,
        //   not in terms of code unit string indexes.
        // Use the inputMap mechanism to take care of this in addition to indexing differences
        //    from normalization and/or UTF-8 input.
        UBool hadExistingMap = inputMap.isValid();
        if (!hadExistingMap) {
            inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
            if (U_FAILURE(status)) {
                return 0;
            }
        }
        int32_t cpIdx = 0;
        // Compact the map in place: entry cpIdx gets the value previously at
        // the corresponding code unit index cuIdx (cuIdx >= cpIdx always).
        for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
            U_ASSERT(cuIdx >= cpIdx);
            if (hadExistingMap) {
                inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
            } else {
                inputMap->addElement(cuIdx+rangeStart, status);
            }
            cpIdx++;
            if (cuIdx == inString.length()) {
               break;
            }
        }
    }

#if UCONFIG_USE_ML_PHRASE_BREAKING
    // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja.
    if (isPhraseBreaking && isCj) {
        return fMlBreakEngine->divideUpRange(inText, rangeStart, rangeEnd, foundBreaks, inString,
                                             inputMap, status);
    }
#endif

    // bestSnlp[i] is the snlp of the best segmentation of the first i
    // code points in the range to be matched.
    UVector32 bestSnlp(numCodePts + 1, status);
    bestSnlp.addElement(0, status);
    for(int32_t i = 1; i <= numCodePts; i++) {
        bestSnlp.addElement(kuint32max, status);
    }


    // prev[i] is the index of the last CJK code point in the previous word in
    // the best segmentation of the first i characters.
    UVector32 prev(numCodePts + 1, status);
    for(int32_t i = 0; i <= numCodePts; i++){
        prev.addElement(-1, status);
    }

    const int32_t maxWordSize = 20;
    UVector32 values(numCodePts, status);
    values.setSize(numCodePts);
    UVector32 lengths(numCodePts, status);
    lengths.setSize(numCodePts);

    UText fu = UTEXT_INITIALIZER;
    utext_openUnicodeString(&fu, &inString, &status);

    // Dynamic programming to find the best segmentation.

    // In outer loop, i  is the code point index,
    //                ix is the corresponding string (code unit) index.
    //    They differ when the string contains supplementary characters.
    int32_t ix = 0;
    bool is_prev_katakana = false;
    for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
        // Position i is unreachable by any segmentation so far; skip it.
        if (static_cast<uint32_t>(bestSnlp.elementAti(i)) == kuint32max) {
            continue;
        }

        int32_t count;
        utext_setNativeIndex(&fu, ix);
        count = fDictionary->matches(&fu, maxWordSize, numCodePts,
                             nullptr, lengths.getBuffer(), values.getBuffer(), nullptr);
                             // Note: lengths is filled with code point lengths
                             //       The nullptr parameter is the ignored code unit lengths.

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible, i.e. the least likely to occur.
        // Exclude Korean characters from this treatment, as they should be left
        // together by default.
        if ((count == 0 || lengths.elementAti(0) != 1) &&
                !fHangulWordSet.contains(inString.char32At(ix))) {
            values.setElementAt(maxSnlp, count);   // 255
            lengths.setElementAt(1, count++);
        }

        // Relax: try extending the best path ending at i by each candidate word.
        for (int32_t j = 0; j < count; j++) {
            uint32_t newSnlp = static_cast<uint32_t>(bestSnlp.elementAti(i)) + static_cast<uint32_t>(values.elementAti(j));
            int32_t ln_j_i = lengths.elementAti(j) + i;
            if (newSnlp < static_cast<uint32_t>(bestSnlp.elementAti(ln_j_i))) {
                bestSnlp.setElementAt(newSnlp, ln_j_i);
                prev.setElementAt(i, ln_j_i);
            }
        }

        // In Japanese,
        // Katakana word in single character is pretty rare. So we apply
        // the following heuristic to Katakana: any continuous run of Katakana
        // characters is considered a candidate word with a default cost
        // specified in the katakanaCost table according to its length.

        bool is_katakana = isKatakana(inString.char32At(ix));
        int32_t katakanaRunLength = 1;
        if (!is_prev_katakana && is_katakana) {
            int32_t j = inString.moveIndex32(ix, 1);
            // Find the end of the continuous run of Katakana characters
            while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
                    isKatakana(inString.char32At(j))) {
                j = inString.moveIndex32(j, 1);
                katakanaRunLength++;
            }
            if (katakanaRunLength < kMaxKatakanaGroupLength) {
                uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
                if (newSnlp < static_cast<uint32_t>(bestSnlp.elementAti(i + katakanaRunLength))) {
                    bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
                    prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }
    utext_close(&fu);

    // Start pushing the optimal offset index into t_boundary (t for tentative).
    // prev[numCodePts] is guaranteed to be meaningful.
    // We'll first push in the reverse order, i.e.,
    // t_boundary[0] = numCodePts, and afterwards do a swap.
    UVector32 t_boundary(numCodePts+1, status);

    int32_t numBreaks = 0;
    // No segmentation found, set boundary to end of range
    if (static_cast<uint32_t>(bestSnlp.elementAti(numCodePts)) == kuint32max) {
        t_boundary.addElement(numCodePts, status);
        numBreaks++;
    } else if (isPhraseBreaking) {
        t_boundary.addElement(numCodePts, status);
        if(U_SUCCESS(status)) {
            numBreaks++;
            int32_t prevIdx = numCodePts;

            int32_t codeUnitIdx = -1;
            int32_t prevCodeUnitIdx = -1;
            int32_t length = -1;
            // Walk the best path backwards, filtering breakpoints for phrase mode.
            for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {
                codeUnitIdx = inString.moveIndex32(0, i);
                prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);
                // Calculate the length by using the code unit.
                length = prevCodeUnitIdx - codeUnitIdx;
                prevIdx = i;
                // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
                // characters don't occur.
                if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
                    && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))
                        || !isKatakana(inString.char32At(codeUnitIdx)))) {
                    t_boundary.addElement(i, status);
                    numBreaks++;
                }
            }
        }
    } else {
        // Word mode: keep every boundary on the best path.
        for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
            t_boundary.addElement(i, status);
            numBreaks++;
        }
        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
    }

    // Add a break for the start of the dictionary range if there is not one
    // there already.
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
        t_boundary.addElement(0, status);
        numBreaks++;
    }

    // Now that we're done, convert positions in t_boundary[] (indices in
    // the normalized input string) back to indices in the original input UText
    // while reversing t_boundary and pushing values to foundBreaks.
    int32_t prevCPPos = -1;
    int32_t prevUTextPos = -1;
    int32_t correctedNumBreaks = 0;
    for (int32_t i = numBreaks - 1; i >= 0; i--) {
        int32_t cpPos = t_boundary.elementAti(i);
        U_ASSERT(cpPos > prevCPPos);
        int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
        U_ASSERT(utextPos >= prevUTextPos);
        if (utextPos > prevUTextPos) {
            // Boundaries are added to foundBreaks output in ascending order.
            U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
            // In phrase breaking, there has to be a breakpoint between Cj character and close
            // punctuation.
            // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
            if (utextPos != rangeStart
                || (isPhraseBreaking && utextPos > 0
                    && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
                foundBreaks.push(utextPos, status);
                correctedNumBreaks++;
            }
        } else {
            // Normalization expanded the input text, the dictionary found a boundary
            // within the expansion, giving two boundaries with the same index in the
            // original text. Ignore the second. See ticket #12918.
            --numBreaks;
        }
        prevCPPos = cpPos;
        prevUTextPos = utextPos;
    }
    (void)prevCPPos;   // suppress compiler warnings about unused variable

    UChar32 nextChar = utext_char32At(inText, rangeEnd);
    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
        // In phrase breaking, there has to be a breakpoint between Cj character and
        // the number/open punctuation.
        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
        // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
        // E.g. しかもロゴがUnicode!
        //      -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
        if (isPhraseBreaking) {
            if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
                foundBreaks.popi();
                correctedNumBreaks--;
            }
        } else {
            // Word mode: never report a break at the very end of the range.
            foundBreaks.popi();
            correctedNumBreaks--;
        }
    }

    // inString goes out of scope
    // inputMap goes out of scope
    return correctedNumBreaks;
}

// Load the data used to filter Japanese phrase breakpoints:
// the "extensions" resource list and all Hiragana code points.
void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {
    loadJapaneseExtensions(error);
    loadHiragana(error);
}

// Populate fSkipSet with the strings in the "extensions" table of the
// "ja" break-iterator resource bundle. Breakpoints producing these
// patterns are suppressed during phrase breaking.
void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {
    const char* tag = "extensions";
    ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error);
    if (U_SUCCESS(error)) {
        ResourceBundle bundle = ja.get(tag, error);
        while (U_SUCCESS(error) && bundle.hasNext()) {
            fSkipSet.puti(bundle.getNextString(error), 1, error);
        }
    }
}

// Add every Hiragana code point (as a single-character string) to fSkipSet,
// so single-Hiragana segments are not kept as phrase breakpoints.
void CjkBreakEngine::loadHiragana(UErrorCode& error) {
    UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error);
    hiraganaWordSet.compact();
    UnicodeSetIterator iterator(hiraganaWordSet);
    while (iterator.next()) {
        fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error);
    }
}
#endif

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */