filteredbrk.cpp (26203B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2014-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 #include "unicode/utypes.h" 11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION 12 13 #include "cmemory.h" 14 15 #include "unicode/filteredbrk.h" 16 #include "unicode/ucharstriebuilder.h" 17 #include "unicode/ures.h" 18 19 #include "uresimp.h" // ures_getByKeyWithFallback 20 #include "ubrkimpl.h" // U_ICUDATA_BRKITR 21 #include "uvector.h" 22 #include "cmemory.h" 23 #include "umutex.h" 24 25 U_NAMESPACE_BEGIN 26 27 #ifndef FB_DEBUG 28 #define FB_DEBUG 0 29 #endif 30 31 #if FB_DEBUG 32 #include <stdio.h> 33 static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) { 34 char buf[2048]; 35 if(s) { 36 s->extract(0,s->length(),buf,2048); 37 } else { 38 strcpy(buf,"nullptr"); 39 } 40 fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n", 41 f, l, m, buf, (const void*)s, b?'T':'F',(int)d); 42 } 43 44 #define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__) 45 #else 46 #define FB_TRACE(m,s,b,d) 47 #endif 48 49 /** 50 * Used with sortedInsert() 51 */ 52 static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { 53 const UnicodeString& a = *static_cast<const UnicodeString*>(t1.pointer); 54 const UnicodeString& b = *static_cast<const UnicodeString*>(t2.pointer); 55 return a.compare(b); 56 } 57 58 /** 59 * A UVector which implements a set of strings. 60 */ 61 class UStringSet : public UVector { 62 public: 63 UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject, 64 uhash_compareUnicodeString, 65 1, 66 status) {} 67 virtual ~UStringSet(); 68 /** 69 * Is this UnicodeSet contained? 70 */ 71 inline UBool contains(const UnicodeString& s) { 72 return contains((void*) &s); 73 } 74 using UVector::contains; 75 /** 76 * Return the ith UnicodeString alias 77 */ 78 inline const UnicodeString* getStringAt(int32_t i) const { 79 return static_cast<const UnicodeString*>(elementAt(i)); 80 } 81 /** 82 * Adopt the UnicodeString if not already contained. 83 * Caller no longer owns the pointer in any case. 84 * @return true if adopted successfully, false otherwise (error, or else duplicate) 85 */ 86 inline UBool adopt(UnicodeString *str, UErrorCode &status) { 87 if(U_FAILURE(status) || contains(*str)) { 88 delete str; 89 return false; 90 } else { 91 sortedInsert(str, compareUnicodeString, status); 92 if(U_FAILURE(status)) { 93 return false; 94 } 95 return true; 96 } 97 } 98 /** 99 * Add by value. 100 * @return true if successfully adopted. 101 */ 102 inline UBool add(const UnicodeString& str, UErrorCode &status) { 103 if(U_FAILURE(status)) return false; 104 UnicodeString *t = new UnicodeString(str); 105 if(t==nullptr) { 106 status = U_MEMORY_ALLOCATION_ERROR; return false; 107 } 108 return adopt(t, status); 109 } 110 /** 111 * Remove this string. 112 * @return true if successfully removed, false otherwise (error, or else it wasn't there) 113 */ 114 inline UBool remove(const UnicodeString &s, UErrorCode &status) { 115 if(U_FAILURE(status)) return false; 116 return removeElement((void*) &s); 117 } 118 }; 119 120 /** 121 * Virtual, won't be inlined 122 */ 123 UStringSet::~UStringSet() {} 124 125 /* ----------------------------------------------------------- */ 126 127 128 /* Filtered Break constants */ 129 static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie 130 static const int32_t kMATCH = (1<<1); //< exact match - skip this one. 131 static const int32_t kSuppressInReverse = (1<<0); 132 static const int32_t kAddToForward = (1<<1); 133 static const char16_t kFULLSTOP = 0x002E; // '.' 134 135 /** 136 * Shared data for SimpleFilteredSentenceBreakIterator 137 */ 138 class SimpleFilteredSentenceBreakData : public UMemory { 139 public: 140 SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards ) 141 : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { } 142 SimpleFilteredSentenceBreakData *incr() { 143 umtx_atomic_inc(&refcount); 144 return this; 145 } 146 SimpleFilteredSentenceBreakData *decr() { 147 if(umtx_atomic_dec(&refcount) <= 0) { 148 delete this; 149 } 150 return nullptr; 151 } 152 virtual ~SimpleFilteredSentenceBreakData(); 153 154 bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); } 155 bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); } 156 157 const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; } 158 const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; } 159 160 private: 161 // These tries own their data arrays. 162 // They are shared and must therefore not be modified. 163 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M." 164 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs. 165 u_atomic_int32_t refcount; 166 }; 167 168 SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {} 169 170 /** 171 * Concrete implementation 172 */ 173 class SimpleFilteredSentenceBreakIterator : public BreakIterator { 174 public: 175 SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); 176 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other); 177 virtual ~SimpleFilteredSentenceBreakIterator(); 178 private: 179 SimpleFilteredSentenceBreakData *fData; 180 LocalPointer<BreakIterator> fDelegate; 181 LocalUTextPointer fText; 182 183 /* -- subclass interface -- */ 184 public: 185 /* -- cloning and other subclass stuff -- */ 186 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/, 187 int32_t &/*BufferSize*/, 188 UErrorCode &status) override { 189 // for now - always deep clone 190 status = U_SAFECLONE_ALLOCATED_WARNING; 191 return clone(); 192 } 193 virtual SimpleFilteredSentenceBreakIterator* clone() const override { return new SimpleFilteredSentenceBreakIterator(*this); } 194 virtual UClassID getDynamicClassID() const override { return nullptr; } 195 virtual bool operator==(const BreakIterator& o) const override { if(this==&o) return true; return false; } 196 197 /* -- text modifying -- */ 198 virtual void setText(UText *text, UErrorCode &status) override { fDelegate->setText(text,status); } 199 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) override { fDelegate->refreshInputText(input,status); return *this; } 200 virtual void adoptText(CharacterIterator* it) override { fDelegate->adoptText(it); } 201 virtual void setText(const UnicodeString &text) override { fDelegate->setText(text); } 202 203 /* -- other functions that are just delegated -- */ 204 virtual UText *getUText(UText *fillIn, UErrorCode &status) const override { return fDelegate->getUText(fillIn,status); } 205 virtual CharacterIterator& getText() const override { return fDelegate->getText(); } 206 207 /* -- ITERATION -- */ 208 virtual int32_t first() override; 209 virtual int32_t preceding(int32_t offset) override; 210 virtual int32_t previous() override; 211 virtual UBool isBoundary(int32_t offset) override; 212 virtual int32_t current() const override { return fDelegate->current(); } // we keep the delegate current, so this should be correct. 213 214 virtual int32_t next() override; 215 216 virtual int32_t next(int32_t n) override; 217 virtual int32_t following(int32_t offset) override; 218 virtual int32_t last() override; 219 220 private: 221 /** 222 * Given that the fDelegate has already given its "initial" answer, 223 * find the NEXT actual (non-excepted) break. 224 * @param n initial position from delegate 225 * @return new break position or UBRK_DONE 226 */ 227 int32_t internalNext(int32_t n); 228 /** 229 * Given that the fDelegate has already given its "initial" answer, 230 * find the PREV actual (non-excepted) break. 231 * @param n initial position from delegate 232 * @return new break position or UBRK_DONE 233 */ 234 int32_t internalPrev(int32_t n); 235 /** 236 * set up the UText with the value of the fDelegate. 237 * Call this before calling breakExceptionAt. 238 * May be able to avoid excess calls 239 */ 240 void resetState(UErrorCode &status); 241 /** 242 * Is there a match (exception) at this spot? 243 */ 244 enum EFBMatchResult { kNoExceptionHere, kExceptionHere }; 245 /** 246 * Determine if there is an exception at this spot 247 * @param n spot to check 248 * @return kNoExceptionHere or kExceptionHere 249 **/ 250 enum EFBMatchResult breakExceptionAt(int32_t n); 251 }; 252 253 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other) 254 : BreakIterator(other), fData(other.fData->incr()), fDelegate(other.fDelegate->clone()) 255 { 256 } 257 258 259 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : 260 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), 261 fData(new SimpleFilteredSentenceBreakData(forwards, backwards)), 262 fDelegate(adopt) 263 { 264 if (fData == nullptr) { 265 delete forwards; 266 delete backwards; 267 if (U_SUCCESS(status)) { 268 status = U_MEMORY_ALLOCATION_ERROR; 269 } 270 } 271 } 272 273 SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() { 274 fData = fData->decr(); 275 } 276 277 void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) { 278 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status)); 279 } 280 281 SimpleFilteredSentenceBreakIterator::EFBMatchResult 282 SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) { 283 int64_t bestPosn = -1; 284 int32_t bestValue = -1; 285 // loops while 'n' points to an exception. 286 utext_setNativeIndex(fText.getAlias(), n); // from n.. 287 288 //if(debug2) u_printf(" n@ %d\n", n); 289 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") 290 if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here?? 291 // TODO only do this the 1st time? 292 //if(debug2) u_printf("skipping prev: |%C| \n", (char16_t)uch); 293 } else { 294 //if(debug2) u_printf("not skipping prev: |%C| \n", (char16_t)uch); 295 utext_next32(fText.getAlias()); 296 //if(debug2) u_printf(" -> : |%C| \n", (char16_t)uch); 297 } 298 299 { 300 // Do not modify the shared trie! 301 UCharsTrie iter(fData->getBackwardsTrie()); 302 UChar32 uch; 303 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards 304 UStringTrieResult r = iter.nextForCodePoint(uch); 305 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far 306 bestPosn = utext_getNativeIndex(fText.getAlias()); 307 bestValue = iter.getValue(); 308 } 309 if(!USTRINGTRIE_HAS_NEXT(r)) { 310 break; 311 } 312 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (char16_t)uch, r, utext_getNativeIndex(fText.getAlias())); 313 } 314 } 315 316 //if(bestValue >= 0) { 317 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue); 318 //} 319 320 if(bestPosn>=0) { 321 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (char16_t)uch, r, bestPosn, bestValue); 322 323 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what? 324 //int32_t bestValue = iter.getValue(); 325 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (char16_t)uch, r, bestValue); 326 327 if(bestValue == kMATCH) { // exact match! 328 //if(debug2) u_printf(" exact backward match\n"); 329 return kExceptionHere; // See if the next is another exception. 330 } else if(bestValue == kPARTIAL 331 && fData->hasForwardsPartialTrie()) { // make sure there's a forward trie 332 //if(debug2) u_printf(" partial backward match\n"); 333 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie 334 // to see if it matches something going forward. 335 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE; 336 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close .. 337 //if(debug2) u_printf("Retrying at %d\n", bestPosn); 338 // Do not modify the shared trie! 339 UCharsTrie iter(fData->getForwardsPartialTrie()); 340 UChar32 uch; 341 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL && 342 USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) { 343 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (char16_t)uch, rfwd, utext_getNativeIndex(fText.getAlias())); 344 } 345 if(USTRINGTRIE_MATCHES(rfwd)) { 346 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (char16_t)uch); 347 // only full matches here, nothing to check 348 // skip the next: 349 return kExceptionHere; 350 } else { 351 //if(debug2) u_printf("fwd> /%C/ no match.\n", (char16_t)uch); 352 // no match (no exception) -return the 'underlying' break 353 return kNoExceptionHere; 354 } 355 } else { 356 return kNoExceptionHere; // internal error and/or no forwards trie 357 } 358 } else { 359 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (char16_t)uch, r); // no best match 360 return kNoExceptionHere; // No match - so exit. Not an exception. 361 } 362 } 363 364 // the workhorse single next. 365 int32_t 366 SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) { 367 if(n == UBRK_DONE || // at end or 368 !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions 369 return n; 370 } 371 // OK, do we need to break here? 372 UErrorCode status = U_ZERO_ERROR; 373 // refresh text 374 resetState(status); 375 if(U_FAILURE(status)) return UBRK_DONE; // bail out 376 int64_t utextLen = utext_nativeLength(fText.getAlias()); 377 378 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); 379 while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate). 380 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); 381 382 switch(m) { 383 case kExceptionHere: 384 n = fDelegate->next(); // skip this one. Find the next lowerlevel break. 385 continue; 386 387 default: 388 case kNoExceptionHere: 389 return n; 390 } 391 } 392 return n; 393 } 394 395 int32_t 396 SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) { 397 if(n == 0 || n == UBRK_DONE || // at end or 398 !fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions 399 return n; 400 } 401 // OK, do we need to break here? 402 UErrorCode status = U_ZERO_ERROR; 403 // refresh text 404 resetState(status); 405 if(U_FAILURE(status)) return UBRK_DONE; // bail out 406 407 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias())); 408 while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate). 409 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n); 410 411 switch(m) { 412 case kExceptionHere: 413 n = fDelegate->previous(); // skip this one. Find the next lowerlevel break. 414 continue; 415 416 default: 417 case kNoExceptionHere: 418 return n; 419 } 420 } 421 return n; 422 } 423 424 425 int32_t 426 SimpleFilteredSentenceBreakIterator::next() { 427 return internalNext(fDelegate->next()); 428 } 429 430 int32_t 431 SimpleFilteredSentenceBreakIterator::first() { 432 // Don't suppress a break opportunity at the beginning of text. 433 return fDelegate->first(); 434 } 435 436 int32_t 437 SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) { 438 return internalPrev(fDelegate->preceding(offset)); 439 } 440 441 int32_t 442 SimpleFilteredSentenceBreakIterator::previous() { 443 return internalPrev(fDelegate->previous()); 444 } 445 446 UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) { 447 if (!fDelegate->isBoundary(offset)) return false; // no break to suppress 448 449 if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions 450 451 UErrorCode status = U_ZERO_ERROR; 452 resetState(status); 453 454 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset); 455 456 switch(m) { 457 case kExceptionHere: 458 return false; 459 default: 460 case kNoExceptionHere: 461 return true; 462 } 463 } 464 465 int32_t 466 SimpleFilteredSentenceBreakIterator::next(int32_t offset) { 467 return internalNext(fDelegate->next(offset)); 468 } 469 470 int32_t 471 SimpleFilteredSentenceBreakIterator::following(int32_t offset) { 472 return internalNext(fDelegate->following(offset)); 473 } 474 475 int32_t 476 SimpleFilteredSentenceBreakIterator::last() { 477 // Don't suppress a break opportunity at the end of text. 478 return fDelegate->last(); 479 } 480 481 482 /** 483 * Concrete implementation of builder class. 484 */ 485 class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { 486 public: 487 virtual ~SimpleFilteredBreakIteratorBuilder(); 488 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); 489 SimpleFilteredBreakIteratorBuilder(UErrorCode &status); 490 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override; 491 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) override; 492 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) override; 493 private: 494 UStringSet fSet; 495 }; 496 497 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() 498 { 499 } 500 501 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status) 502 : fSet(status) 503 { 504 } 505 506 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) 507 : fSet(status) 508 { 509 if(U_SUCCESS(status)) { 510 UErrorCode subStatus = U_ZERO_ERROR; 511 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus)); 512 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { 513 status = subStatus; // copy the failing status 514 #if FB_DEBUG 515 fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); 516 #endif 517 return; // leaves the builder empty, if you try to use it. 518 } 519 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", nullptr, &subStatus)); 520 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { 521 status = subStatus; // copy the failing status 522 #if FB_DEBUG 523 fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); 524 #endif 525 return; // leaves the builder empty, if you try to use it. 526 } 527 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", nullptr, &subStatus)); 528 529 #if FB_DEBUG 530 { 531 UErrorCode subsub = subStatus; 532 fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus)); 533 } 534 #endif 535 536 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) { 537 status = subStatus; // copy the failing status 538 #if FB_DEBUG 539 fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status)); 540 #endif 541 return; // leaves the builder empty, if you try to use it. 542 } 543 544 LocalUResourceBundlePointer strs; 545 subStatus = status; // Pick up inherited warning status now 546 do { 547 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus)); 548 if(strs.isValid() && U_SUCCESS(subStatus)) { 549 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status)); 550 suppressBreakAfter(str, status); // load the string 551 } 552 } while (strs.isValid() && U_SUCCESS(subStatus)); 553 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) { 554 status = subStatus; 555 } 556 } 557 } 558 559 UBool 560 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) 561 { 562 UBool r = fSet.add(exception, status); 563 FB_TRACE("suppressBreakAfter",&exception,r,0); 564 return r; 565 } 566 567 UBool 568 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) 569 { 570 UBool r = fSet.remove(exception, status); 571 FB_TRACE("unsuppressBreakAfter",&exception,r,0); 572 return r; 573 } 574 575 /** 576 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly. 577 * Work around this. 578 * 579 * Note: "new UnicodeString[subCount]" ends up calling global operator new 580 * on MSVC2012 for some reason. 581 */ 582 static inline UnicodeString* newUnicodeStringArray(size_t count) { 583 return new UnicodeString[count ? count : 1]; 584 } 585 586 BreakIterator * 587 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) { 588 LocalPointer<BreakIterator> adopt(adoptBreakIterator); 589 590 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status); 591 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status); 592 if(U_FAILURE(status)) { 593 return nullptr; 594 } 595 596 int32_t revCount = 0; 597 int32_t fwdCount = 0; 598 599 int32_t subCount = fSet.size(); 600 601 UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount); 602 603 LocalArray<UnicodeString> ustrs(ustrs_ptr); 604 605 LocalMemory<int> partials; 606 partials.allocateInsteadAndReset(subCount); 607 608 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs. 609 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." 610 611 int n=0; 612 for ( int32_t i = 0; 613 i<fSet.size(); 614 i++) { 615 const UnicodeString *abbr = fSet.getStringAt(i); 616 if(abbr) { 617 FB_TRACE("build",abbr,true,i); 618 ustrs[n] = *abbr; // copy by value 619 FB_TRACE("ustrs[n]",&ustrs[n],true,i); 620 } else { 621 FB_TRACE("build",abbr,false,i); 622 status = U_MEMORY_ALLOCATION_ERROR; 623 return nullptr; 624 } 625 partials[n] = 0; // default: not partial 626 n++; 627 } 628 // first pass - find partials. 629 for(int i=0;i<subCount;i++) { 630 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations 631 if(nn>-1 && (nn+1)!=ustrs[i].length()) { 632 FB_TRACE("partial",&ustrs[i],false,i); 633 // is partial. 634 // is it unique? 635 int sameAs = -1; 636 for(int j=0;j<subCount;j++) { 637 if(j==i) continue; 638 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { 639 FB_TRACE("prefix",&ustrs[j],false,nn+1); 640 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn 641 if(partials[j]==0) { // hasn't been processed yet 642 partials[j] = kSuppressInReverse | kAddToForward; 643 FB_TRACE("suppressing",&ustrs[j],false,j); 644 } else if(partials[j] & kSuppressInReverse) { 645 sameAs = j; // the other entry is already in the reverse table. 646 } 647 } 648 } 649 FB_TRACE("for partial same-",&ustrs[i],false,sameAs); 650 FB_TRACE(" == partial #",&ustrs[i],false,partials[i]); 651 UnicodeString prefix(ustrs[i], 0, nn+1); 652 if(sameAs == -1 && partials[i] == 0) { 653 // first one - add the prefix to the reverse table. 654 prefix.reverse(); 655 builder->add(prefix, kPARTIAL, status); 656 revCount++; 657 FB_TRACE("Added partial",&prefix,false, i); 658 FB_TRACE(u_errorName(status),&ustrs[i],false,i); 659 partials[i] = kSuppressInReverse | kAddToForward; 660 } else { 661 FB_TRACE("NOT adding partial",&prefix,false, i); 662 FB_TRACE(u_errorName(status),&ustrs[i],false,i); 663 } 664 } 665 } 666 for(int i=0;i<subCount;i++) { 667 if(partials[i]==0) { 668 ustrs[i].reverse(); 669 builder->add(ustrs[i], kMATCH, status); 670 revCount++; 671 FB_TRACE(u_errorName(status), &ustrs[i], false, i); 672 } else { 673 FB_TRACE("Adding fwd",&ustrs[i], false, i); 674 675 // an optimization would be to only add the portion after the '.' 676 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, 677 // instead of "Ph.D." since we already know the "Ph." part is a match. 678 // would need the trie to be able to hold 0-length strings, though. 679 builder2->add(ustrs[i], kMATCH, status); // forward 680 fwdCount++; 681 //ustrs[i].reverse(); 682 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); 683 } 684 } 685 FB_TRACE("AbbrCount",nullptr,false, subCount); 686 687 if(revCount>0) { 688 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); 689 if(U_FAILURE(status)) { 690 FB_TRACE(u_errorName(status),nullptr,false, -1); 691 return nullptr; 692 } 693 } 694 695 if(fwdCount>0) { 696 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); 697 if(U_FAILURE(status)) { 698 FB_TRACE(u_errorName(status),nullptr,false, -1); 699 return nullptr; 700 } 701 } 702 703 return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); 704 } 705 706 707 // ----------- Base class implementation 708 709 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { 710 } 711 712 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() { 713 } 714 715 FilteredBreakIteratorBuilder * 716 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { 717 if(U_FAILURE(status)) return nullptr; 718 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status); 719 return (U_SUCCESS(status))? ret.orphan(): nullptr; 720 } 721 722 FilteredBreakIteratorBuilder * 723 FilteredBreakIteratorBuilder::createInstance(UErrorCode &status) { 724 return createEmptyInstance(status); 725 } 726 727 FilteredBreakIteratorBuilder * 728 FilteredBreakIteratorBuilder::createEmptyInstance(UErrorCode& status) { 729 if(U_FAILURE(status)) return nullptr; 730 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status); 731 return (U_SUCCESS(status))? ret.orphan(): nullptr; 732 } 733 734 U_NAMESPACE_END 735 736 #endif //#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION