usearch.cpp (91417B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2001-2015 IBM and others. All rights reserved. 6 ********************************************************************** 7 * Date Name Description 8 * 07/02/2001 synwee Creation. 9 ********************************************************************** 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 15 16 #include "unicode/usearch.h" 17 #include "unicode/ustring.h" 18 #include "unicode/uchar.h" 19 #include "unicode/utf16.h" 20 #include "normalizer2impl.h" 21 #include "usrchimp.h" 22 #include "cmemory.h" 23 #include "ucln_in.h" 24 #include "uassert.h" 25 #include "ustr_imp.h" 26 27 U_NAMESPACE_USE 28 29 // internal definition --------------------------------------------------- 30 31 #define LAST_BYTE_MASK_ 0xFF 32 #define SECOND_LAST_BYTE_SHIFT_ 8 33 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000 34 35 static const Normalizer2Impl *g_nfcImpl = nullptr; 36 37 // internal methods ------------------------------------------------- 38 39 /** 40 * Fast collation element iterator setOffset. 41 * This function does not check for bounds. 42 * @param coleiter collation element iterator 43 * @param offset to set 44 */ 45 static 46 inline void setColEIterOffset(UCollationElements *elems, 47 int32_t offset, 48 UErrorCode &status) 49 { 50 // Note: Not "fast" any more after the 2013 collation rewrite. 51 // We do not want to expose more internals than necessary. 52 ucol_setOffset(elems, offset, &status); 53 } 54 55 /** 56 * Getting the mask for collation strength 57 * @param strength collation strength 58 * @return collation element mask 59 */ 60 static 61 inline uint32_t getMask(UCollationStrength strength) 62 { 63 switch (strength) 64 { 65 case UCOL_PRIMARY: 66 return UCOL_PRIMARYORDERMASK; 67 case UCOL_SECONDARY: 68 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; 69 default: 70 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | 71 UCOL_PRIMARYORDERMASK; 72 } 73 } 74 75 U_CDECL_BEGIN 76 static UBool U_CALLCONV 77 usearch_cleanup() { 78 g_nfcImpl = nullptr; 79 return true; 80 } 81 U_CDECL_END 82 83 /** 84 * Initializing the fcd tables. 85 * Internal method, status assumed to be a success. 86 * @param status output error if any, caller to check status before calling 87 * method, status assumed to be success when passed in. 88 */ 89 static 90 inline void initializeFCD(UErrorCode *status) 91 { 92 if (g_nfcImpl == nullptr) { 93 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); 94 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); 95 } 96 } 97 98 /** 99 * Gets the fcd value for a character at the argument index. 100 * This method takes into accounts of the supplementary characters. 101 * @param str UTF16 string where character for fcd retrieval resides 102 * @param offset position of the character whose fcd is to be retrieved, to be 103 * overwritten with the next character position, taking 104 * surrogate characters into consideration. 105 * @param strlength length of the argument string 106 * @return fcd value 107 */ 108 static 109 uint16_t getFCD(const char16_t *str, int32_t *offset, 110 int32_t strlength) 111 { 112 const char16_t *temp = str + *offset; 113 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength); 114 *offset = static_cast<int32_t>(temp - str); 115 return result; 116 } 117 118 /** 119 * Getting the modified collation elements taking into account the collation 120 * attributes 121 * @param strsrch string search data 122 * @param sourcece 123 * @return the modified collation element 124 */ 125 static 126 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) 127 { 128 // note for tertiary we can't use the collator->tertiaryMask, that 129 // is a preprocessed mask that takes into account case options. since 130 // we are only concerned with exact matches, we don't need that. 131 sourcece &= strsrch->ceMask; 132 133 if (strsrch->toShift) { 134 // alternate handling here, since only the 16 most significant digits 135 // is only used, we can safely do a compare without masking 136 // if the ce is a variable, we mask and get only the primary values 137 // no shifting to quartenary is required since all primary values 138 // less than variabletop will need to be masked off anyway. 139 if (strsrch->variableTop > sourcece) { 140 if (strsrch->strength >= UCOL_QUATERNARY) { 141 sourcece &= UCOL_PRIMARYORDERMASK; 142 } 143 else { 144 sourcece = UCOL_IGNORABLE; 145 } 146 } 147 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { 148 sourcece = 0xFFFF; 149 } 150 151 return sourcece; 152 } 153 154 /** 155 * Adds a uint32_t value to a destination array. 156 * Creates a new array if we run out of space. The caller will have to 157 * manually deallocate the newly allocated array. 158 * destination not to be nullptr and has at least size destinationCapacity. 159 * @param destination target array 160 * @param destinationCapacity target array size, return value for the new size 161 * @param destOnHeap whether the destination array is heap-allocated 162 * @param offset destination offset to add value 163 * @param value to be added 164 * @param increments incremental size expected 165 * @param status output error if any 166 * @return new destination array, destination if there was no new allocation 167 */ 168 static 169 inline int32_t * addTouint32_tArray(int32_t *destination, 170 uint32_t *destinationCapacity, 171 bool destOnHeap, 172 uint32_t offset, 173 uint32_t value, 174 uint32_t increments, 175 UErrorCode *status) 176 { 177 if (U_FAILURE(*status)) { 178 return destination; 179 } 180 if (offset >= *destinationCapacity) { 181 uint32_t newlength = offset + increments; 182 int32_t* temp = static_cast<int32_t*>(uprv_malloc(sizeof(int32_t) * newlength)); 183 if (temp == nullptr) { 184 *status = U_MEMORY_ALLOCATION_ERROR; 185 return destination; 186 } 187 uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset); 188 if (destOnHeap) { 189 uprv_free(destination); 190 } 191 *destinationCapacity = newlength; 192 destination = temp; 193 } 194 destination[offset] = value; 195 return destination; 196 } 197 198 /** 199 * Adds a uint64_t value to a destination array. 200 * Creates a new array if we run out of space. The caller will have to 201 * manually deallocate the newly allocated array. 202 * destination not to be nullptr and has at least size destinationCapacity. 203 * @param destination target array 204 * @param destinationCapacity target array size, return value for the new size 205 * @param destOnHeap whether the destination array is heap-allocated 206 * @param offset destination offset to add value 207 * @param value to be added 208 * @param increments incremental size expected 209 * @param status output error if any 210 * @return new destination array, destination if there was no new allocation 211 */ 212 static 213 inline int64_t * addTouint64_tArray(int64_t *destination, 214 uint32_t *destinationCapacity, 215 bool destOnHeap, 216 uint32_t offset, 217 uint64_t value, 218 uint32_t increments, 219 UErrorCode *status) 220 { 221 if (U_FAILURE(*status)) { 222 return destination; 223 } 224 if (offset >= *destinationCapacity) { 225 uint32_t newlength = offset + increments; 226 int64_t* temp = static_cast<int64_t*>(uprv_malloc(sizeof(int64_t) * newlength)); 227 if (temp == nullptr) { 228 *status = U_MEMORY_ALLOCATION_ERROR; 229 return destination; 230 } 231 uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset); 232 if (destOnHeap) { 233 uprv_free(destination); 234 } 235 *destinationCapacity = newlength; 236 destination = temp; 237 } 238 destination[offset] = value; 239 return destination; 240 } 241 242 /** 243 * Initializing the ce table for a pattern. 244 * Stores non-ignorable collation keys. 245 * Table size will be estimated by the size of the pattern text. Table 246 * expansion will be perform as we go along. Adding 1 to ensure that the table 247 * size definitely increases. 248 * Internal method, status assumed to be a success. 249 * @param strsrch string search data 250 * @param status output error if any, caller to check status before calling 251 * method, status assumed to be success when passed in. 252 */ 253 static 254 inline void initializePatternCETable(UStringSearch *strsrch, UErrorCode *status) 255 { 256 UPattern *pattern = &(strsrch->pattern); 257 uint32_t cetablesize = INITIAL_ARRAY_SIZE_; 258 int32_t *cetable = pattern->cesBuffer; 259 uint32_t patternlength = pattern->textLength; 260 UCollationElements *coleiter = strsrch->utilIter; 261 262 if (coleiter == nullptr) { 263 coleiter = ucol_openElements(strsrch->collator, pattern->text, 264 patternlength, status); 265 // status will be checked in ucol_next(..) later and if it is an 266 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 267 // returned. 268 strsrch->utilIter = coleiter; 269 } 270 else { 271 ucol_setText(coleiter, pattern->text, pattern->textLength, status); 272 } 273 if(U_FAILURE(*status)) { 274 return; 275 } 276 277 if (pattern->ces != cetable && pattern->ces) { 278 uprv_free(pattern->ces); 279 } 280 281 uint32_t offset = 0; 282 int32_t ce; 283 284 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && 285 U_SUCCESS(*status)) { 286 uint32_t newce = getCE(strsrch, ce); 287 if (newce) { 288 cetable = addTouint32_tArray( 289 cetable, &cetablesize, /* destOnHeap= */ cetable != pattern->cesBuffer, 290 offset, newce, patternlength - ucol_getOffset(coleiter) + 1, status); 291 offset ++; 292 } 293 } 294 295 cetable = addTouint32_tArray( 296 cetable, &cetablesize, /* destOnHeap= */ cetable != pattern->cesBuffer, 297 offset, 0, 1, status); 298 if (U_FAILURE(*status)) { 299 if (cetable != pattern->cesBuffer) { 300 uprv_free(cetable); 301 } 302 return; 303 } 304 pattern->ces = cetable; 305 pattern->cesLength = offset; 306 } 307 308 /** 309 * Initializing the pce table for a pattern. 310 * Stores non-ignorable collation keys. 311 * Table size will be estimated by the size of the pattern text. Table 312 * expansion will be perform as we go along. Adding 1 to ensure that the table 313 * size definitely increases. 314 * Internal method, status assumed to be a success. 315 * @param strsrch string search data 316 * @param status output error if any, caller to check status before calling 317 * method, status assumed to be success when passed in. 318 */ 319 static 320 inline void initializePatternPCETable(UStringSearch *strsrch, 321 UErrorCode *status) 322 { 323 UPattern *pattern = &(strsrch->pattern); 324 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; 325 int64_t *pcetable = pattern->pcesBuffer; 326 uint32_t patternlength = pattern->textLength; 327 UCollationElements *coleiter = strsrch->utilIter; 328 329 if (coleiter == nullptr) { 330 coleiter = ucol_openElements(strsrch->collator, pattern->text, 331 patternlength, status); 332 // status will be checked in nextProcessed(..) later and if it is an error 333 // then UCOL_PROCESSED_NULLORDER is returned by nextProcessed(..), so 0 will be 334 // returned. 335 strsrch->utilIter = coleiter; 336 } else { 337 ucol_setText(coleiter, pattern->text, pattern->textLength, status); 338 } 339 if(U_FAILURE(*status)) { 340 return; 341 } 342 343 if (pattern->pces != pcetable && pattern->pces != nullptr) { 344 uprv_free(pattern->pces); 345 } 346 347 uint32_t offset = 0; 348 int64_t pce; 349 350 icu::UCollationPCE iter(coleiter); 351 352 // ** Should processed CEs be signed or unsigned? 353 // ** (the rest of the code in this file seems to play fast-and-loose with 354 // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 355 while ((pce = iter.nextProcessed(nullptr, nullptr, status)) != UCOL_PROCESSED_NULLORDER && 356 U_SUCCESS(*status)) { 357 pcetable = addTouint64_tArray( 358 pcetable, &pcetablesize, /* destOnHeap= */ pcetable != pattern->pcesBuffer, 359 offset, pce, patternlength - ucol_getOffset(coleiter) + 1, status); 360 offset += 1; 361 } 362 363 pcetable = addTouint64_tArray( 364 pcetable, &pcetablesize, /* destOnHeap= */ pcetable != pattern->pcesBuffer, 365 offset, 0, 1, status); 366 if (U_FAILURE(*status)) { 367 if (pcetable != pattern->pcesBuffer) { 368 uprv_free(pcetable); 369 } 370 return; 371 } 372 pattern->pces = pcetable; 373 pattern->pcesLength = offset; 374 } 375 376 /** 377 * Initializes the pattern struct. 378 * @param strsrch UStringSearch data storage 379 * @param status output error if any, caller to check status before calling 380 * method, status assumed to be success when passed in. 381 */ 382 static 383 inline void initializePattern(UStringSearch *strsrch, UErrorCode *status) 384 { 385 if (U_FAILURE(*status)) { return; } 386 387 UPattern *pattern = &(strsrch->pattern); 388 const char16_t *patterntext = pattern->text; 389 int32_t length = pattern->textLength; 390 int32_t index = 0; 391 392 // Since the strength is primary, accents are ignored in the pattern. 393 if (strsrch->strength == UCOL_PRIMARY) { 394 pattern->hasPrefixAccents = 0; 395 pattern->hasSuffixAccents = 0; 396 } else { 397 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> 398 SECOND_LAST_BYTE_SHIFT_; 399 index = length; 400 U16_BACK_1(patterntext, 0, index); 401 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & 402 LAST_BYTE_MASK_; 403 } 404 405 // ** HACK ** 406 if (strsrch->pattern.pces != nullptr) { 407 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { 408 uprv_free(strsrch->pattern.pces); 409 } 410 411 strsrch->pattern.pces = nullptr; 412 } 413 414 initializePatternCETable(strsrch, status); 415 } 416 417 /** 418 * Initializes the pattern struct and builds the pattern collation element table. 419 * @param strsrch UStringSearch data storage 420 * @param status for output errors if it occurs, status is assumed to be a 421 * success when it is passed in. 422 */ 423 static 424 inline void initialize(UStringSearch *strsrch, UErrorCode *status) 425 { 426 initializePattern(strsrch, status); 427 } 428 429 #if !UCONFIG_NO_BREAK_ITERATION 430 // If the caller provided a character breakiterator we'll return that, 431 // otherwise we lazily create the internal break iterator. 432 static UBreakIterator* getBreakIterator(UStringSearch *strsrch, UErrorCode &status) 433 { 434 if (U_FAILURE(status)) { 435 return nullptr; 436 } 437 438 if (strsrch->search->breakIter != nullptr) { 439 return strsrch->search->breakIter; 440 } 441 442 if (strsrch->search->internalBreakIter != nullptr) { 443 return strsrch->search->internalBreakIter; 444 } 445 446 // Need to create the internal break iterator. 447 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, 448 ucol_getLocaleByType(strsrch->collator, ULOC_VALID_LOCALE, &status), 449 strsrch->search->text, strsrch->search->textLength, &status); 450 451 return strsrch->search->internalBreakIter; 452 } 453 #endif 454 455 /** 456 * Sets the match result to "not found", regardless of the incoming error status. 457 * If an error occurs while setting the result, it is reported back. 458 * 459 * @param strsrch string search data 460 * @param status for output errors, if they occur. 461 */ 462 static 463 inline void setMatchNotFound(UStringSearch *strsrch, UErrorCode &status) 464 { 465 UErrorCode localStatus = U_ZERO_ERROR; 466 467 strsrch->search->matchedIndex = USEARCH_DONE; 468 strsrch->search->matchedLength = 0; 469 if (strsrch->search->isForwardSearching) { 470 setColEIterOffset(strsrch->textIter, strsrch->search->textLength, localStatus); 471 } 472 else { 473 setColEIterOffset(strsrch->textIter, 0, localStatus); 474 } 475 476 // If an error occurred while setting the result to not found (ex: OOM), 477 // then we want to report that error back to the caller. 478 if (U_SUCCESS(status) && U_FAILURE(localStatus)) { 479 status = localStatus; 480 } 481 } 482 483 /** 484 * Checks if the offset runs out of the text string 485 * @param offset 486 * @param textlength of the text string 487 * @return true if offset is out of bounds, false otherwise 488 */ 489 static 490 inline UBool isOutOfBounds(int32_t textlength, int32_t offset) 491 { 492 return offset < 0 || offset > textlength; 493 } 494 495 /** 496 * Checks for identical match 497 * @param strsrch string search data 498 * @param start offset of possible match 499 * @param end offset of possible match 500 * @return true if identical match is found 501 */ 502 static 503 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, int32_t end) 504 { 505 if (strsrch->strength != UCOL_IDENTICAL) { 506 return true; 507 } 508 509 // Note: We could use Normalizer::compare() or similar, but for short strings 510 // which may not be in FCD it might be faster to just NFD them. 511 UErrorCode status = U_ZERO_ERROR; 512 UnicodeString t2, p2; 513 strsrch->nfd->normalize( 514 UnicodeString(false, strsrch->search->text + start, end - start), t2, status); 515 strsrch->nfd->normalize( 516 UnicodeString(false, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); 517 // return false if NFD failed 518 return U_SUCCESS(status) && t2 == p2; 519 } 520 521 // constructors and destructor ------------------------------------------- 522 523 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const char16_t *pattern, 524 int32_t patternlength, 525 const char16_t *text, 526 int32_t textlength, 527 const char *locale, 528 UBreakIterator *breakiter, 529 UErrorCode *status) 530 { 531 if (U_FAILURE(*status)) { 532 return nullptr; 533 } 534 #if UCONFIG_NO_BREAK_ITERATION 535 if (breakiter != nullptr) { 536 *status = U_UNSUPPORTED_ERROR; 537 return nullptr; 538 } 539 #endif 540 if (locale) { 541 // ucol_open internally checks for status 542 UCollator *collator = ucol_open(locale, status); 543 // pattern, text checks are done in usearch_openFromCollator 544 UStringSearch *result = usearch_openFromCollator(pattern, 545 patternlength, text, textlength, 546 collator, breakiter, status); 547 548 if (result == nullptr || U_FAILURE(*status)) { 549 if (collator) { 550 ucol_close(collator); 551 } 552 return nullptr; 553 } 554 else { 555 result->ownCollator = true; 556 } 557 return result; 558 } 559 *status = U_ILLEGAL_ARGUMENT_ERROR; 560 return nullptr; 561 } 562 563 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( 564 const char16_t *pattern, 565 int32_t patternlength, 566 const char16_t *text, 567 int32_t textlength, 568 const UCollator *collator, 569 UBreakIterator *breakiter, 570 UErrorCode *status) 571 { 572 if (U_FAILURE(*status)) { 573 return nullptr; 574 } 575 #if UCONFIG_NO_BREAK_ITERATION 576 if (breakiter != nullptr) { 577 *status = U_UNSUPPORTED_ERROR; 578 return nullptr; 579 } 580 #endif 581 if (pattern == nullptr || text == nullptr || collator == nullptr) { 582 *status = U_ILLEGAL_ARGUMENT_ERROR; 583 return nullptr; 584 } 585 586 // string search does not really work when numeric collation is turned on 587 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) { 588 *status = U_UNSUPPORTED_ERROR; 589 return nullptr; 590 } 591 592 if (U_SUCCESS(*status)) { 593 initializeFCD(status); 594 if (U_FAILURE(*status)) { 595 return nullptr; 596 } 597 598 UStringSearch *result; 599 if (textlength == -1) { 600 textlength = u_strlen(text); 601 } 602 if (patternlength == -1) { 603 patternlength = u_strlen(pattern); 604 } 605 if (textlength <= 0 || patternlength <= 0) { 606 *status = U_ILLEGAL_ARGUMENT_ERROR; 607 return nullptr; 608 } 609 610 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch)); 611 if (result == nullptr) { 612 *status = U_MEMORY_ALLOCATION_ERROR; 613 return nullptr; 614 } 615 616 result->collator = collator; 617 result->strength = ucol_getStrength(collator); 618 result->ceMask = getMask(result->strength); 619 result->toShift = 620 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 621 UCOL_SHIFTED; 622 result->variableTop = ucol_getVariableTop(collator, status); 623 624 result->nfd = Normalizer2::getNFDInstance(*status); 625 626 if (U_FAILURE(*status)) { 627 uprv_free(result); 628 return nullptr; 629 } 630 631 result->search = (USearch *)uprv_malloc(sizeof(USearch)); 632 if (result->search == nullptr) { 633 *status = U_MEMORY_ALLOCATION_ERROR; 634 uprv_free(result); 635 return nullptr; 636 } 637 638 result->search->text = text; 639 result->search->textLength = textlength; 640 641 result->pattern.text = pattern; 642 result->pattern.textLength = patternlength; 643 result->pattern.ces = nullptr; 644 result->pattern.pces = nullptr; 645 646 result->search->breakIter = breakiter; 647 #if !UCONFIG_NO_BREAK_ITERATION 648 result->search->internalBreakIter = nullptr; // Lazily created. 649 if (breakiter) { 650 ubrk_setText(breakiter, text, textlength, status); 651 } 652 #endif 653 654 result->ownCollator = false; 655 result->search->matchedLength = 0; 656 result->search->matchedIndex = USEARCH_DONE; 657 result->utilIter = nullptr; 658 result->textIter = ucol_openElements(collator, text, 659 textlength, status); 660 result->textProcessedIter = nullptr; 661 if (U_FAILURE(*status)) { 662 usearch_close(result); 663 return nullptr; 664 } 665 666 result->search->isOverlap = false; 667 result->search->isCanonicalMatch = false; 668 result->search->elementComparisonType = 0; 669 result->search->isForwardSearching = true; 670 result->search->reset = true; 671 672 initialize(result, status); 673 674 if (U_FAILURE(*status)) { 675 usearch_close(result); 676 return nullptr; 677 } 678 679 return result; 680 } 681 return nullptr; 682 } 683 684 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) 685 { 686 if (strsrch) { 687 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer && 688 strsrch->pattern.ces) { 689 uprv_free(strsrch->pattern.ces); 690 } 691 692 if (strsrch->pattern.pces != nullptr && 693 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) { 694 uprv_free(strsrch->pattern.pces); 695 } 696 697 delete strsrch->textProcessedIter; 698 ucol_closeElements(strsrch->textIter); 699 ucol_closeElements(strsrch->utilIter); 700 701 if (strsrch->ownCollator && strsrch->collator) { 702 ucol_close((UCollator *)strsrch->collator); 703 } 704 705 #if !UCONFIG_NO_BREAK_ITERATION 706 if (strsrch->search->internalBreakIter != nullptr) { 707 ubrk_close(strsrch->search->internalBreakIter); 708 } 709 #endif 710 711 uprv_free(strsrch->search); 712 uprv_free(strsrch); 713 } 714 } 715 716 namespace { 717 718 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) { 719 if (U_FAILURE(*status)) { return false; } 720 if (strsrch->textProcessedIter == nullptr) { 721 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter); 722 if (strsrch->textProcessedIter == nullptr) { 723 *status = U_MEMORY_ALLOCATION_ERROR; 724 return false; 725 } 726 } else { 727 strsrch->textProcessedIter->init(strsrch->textIter); 728 } 729 return true; 730 } 731 732 } 733 734 // set and get methods -------------------------------------------------- 735 736 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 737 int32_t position, 738 UErrorCode *status) 739 { 740 if (U_SUCCESS(*status) && strsrch) { 741 if (isOutOfBounds(strsrch->search->textLength, position)) { 742 *status = U_INDEX_OUTOFBOUNDS_ERROR; 743 } 744 else { 745 setColEIterOffset(strsrch->textIter, position, *status); 746 } 747 strsrch->search->matchedIndex = USEARCH_DONE; 748 strsrch->search->matchedLength = 0; 749 strsrch->search->reset = false; 750 } 751 } 752 753 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) 754 { 755 if (strsrch) { 756 int32_t result = ucol_getOffset(strsrch->textIter); 757 if (isOutOfBounds(strsrch->search->textLength, result)) { 758 return USEARCH_DONE; 759 } 760 return result; 761 } 762 return USEARCH_DONE; 763 } 764 765 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 766 USearchAttribute attribute, 767 USearchAttributeValue value, 768 UErrorCode *status) 769 { 770 if (U_SUCCESS(*status) && strsrch) { 771 switch (attribute) 772 { 773 case USEARCH_OVERLAP : 774 strsrch->search->isOverlap = (value == USEARCH_ON ? true : false); 775 break; 776 case USEARCH_CANONICAL_MATCH : 777 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? true : 778 false); 779 break; 780 case USEARCH_ELEMENT_COMPARISON : 781 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 782 strsrch->search->elementComparisonType = (int16_t)value; 783 } else { 784 strsrch->search->elementComparisonType = 0; 785 } 786 break; 787 case USEARCH_ATTRIBUTE_COUNT : 788 default: 789 *status = U_ILLEGAL_ARGUMENT_ERROR; 790 } 791 } 792 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { 793 *status = U_ILLEGAL_ARGUMENT_ERROR; 794 } 795 } 796 797 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( 798 const UStringSearch *strsrch, 799 USearchAttribute attribute) 800 { 801 if (strsrch) { 802 switch (attribute) { 803 case USEARCH_OVERLAP : 804 return (strsrch->search->isOverlap ? USEARCH_ON : USEARCH_OFF); 805 case USEARCH_CANONICAL_MATCH : 806 return (strsrch->search->isCanonicalMatch ? USEARCH_ON : USEARCH_OFF); 807 case USEARCH_ELEMENT_COMPARISON : 808 { 809 int16_t value = strsrch->search->elementComparisonType; 810 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 811 return (USearchAttributeValue)value; 812 } else { 813 return USEARCH_STANDARD_ELEMENT_COMPARISON; 814 } 815 } 816 case USEARCH_ATTRIBUTE_COUNT : 817 return USEARCH_DEFAULT; 818 } 819 } 820 return USEARCH_DEFAULT; 821 } 822 823 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( 824 const UStringSearch *strsrch) 825 { 826 if (strsrch == nullptr) { 827 return USEARCH_DONE; 828 } 829 return strsrch->search->matchedIndex; 830 } 831 832 833 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 834 char16_t *result, 835 int32_t resultCapacity, 836 UErrorCode *status) 837 { 838 if (U_FAILURE(*status)) { 839 return USEARCH_DONE; 840 } 841 if (strsrch == nullptr || resultCapacity < 0 || (resultCapacity > 0 && 842 result == nullptr)) { 843 *status = U_ILLEGAL_ARGUMENT_ERROR; 844 return USEARCH_DONE; 845 } 846 847 int32_t copylength = strsrch->search->matchedLength; 848 int32_t copyindex = strsrch->search->matchedIndex; 849 if (copyindex == USEARCH_DONE) { 850 u_terminateUChars(result, resultCapacity, 0, status); 851 return USEARCH_DONE; 852 } 853 854 if (resultCapacity < copylength) { 855 copylength = resultCapacity; 856 } 857 if (copylength > 0) { 858 uprv_memcpy(result, strsrch->search->text + copyindex, 859 copylength * sizeof(char16_t)); 860 } 861 return u_terminateUChars(result, resultCapacity, 862 strsrch->search->matchedLength, status); 863 } 864 865 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( 866 const UStringSearch *strsrch) 867 { 868 if (strsrch) { 869 return strsrch->search->matchedLength; 870 } 871 return USEARCH_DONE; 872 } 873 874 #if !UCONFIG_NO_BREAK_ITERATION 875 876 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 877 UBreakIterator *breakiter, 878 UErrorCode *status) 879 { 880 if (U_SUCCESS(*status) && strsrch) { 881 strsrch->search->breakIter = breakiter; 882 if (breakiter) { 883 ubrk_setText(breakiter, strsrch->search->text, 884 strsrch->search->textLength, status); 885 } 886 } 887 } 888 889 U_CAPI const UBreakIterator* U_EXPORT2 890 usearch_getBreakIterator(const UStringSearch *strsrch) 891 { 892 if (strsrch) { 893 return strsrch->search->breakIter; 894 } 895 return nullptr; 896 } 897 898 #endif 899 900 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 901 const char16_t *text, 902 int32_t textlength, 903 UErrorCode *status) 904 { 905 if (U_SUCCESS(*status)) { 906 if (strsrch == nullptr || text == nullptr || textlength < -1 || 907 textlength == 0) { 908 *status = U_ILLEGAL_ARGUMENT_ERROR; 909 } 910 else { 911 if (textlength == -1) { 912 textlength = u_strlen(text); 913 } 914 strsrch->search->text = text; 915 strsrch->search->textLength = textlength; 916 ucol_setText(strsrch->textIter, text, textlength, status); 917 strsrch->search->matchedIndex = USEARCH_DONE; 918 strsrch->search->matchedLength = 0; 919 strsrch->search->reset = true; 920 #if !UCONFIG_NO_BREAK_ITERATION 921 if (strsrch->search->breakIter != nullptr) { 922 ubrk_setText(strsrch->search->breakIter, text, 923 textlength, status); 924 } 925 if (strsrch->search->internalBreakIter != nullptr) { 926 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status); 927 } 928 #endif 929 } 930 } 931 } 932 933 U_CAPI const char16_t * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 934 int32_t *length) 935 { 936 if (strsrch) { 937 *length = strsrch->search->textLength; 938 return strsrch->search->text; 939 } 940 return nullptr; 941 } 942 943 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 944 const UCollator *collator, 945 UErrorCode *status) 946 { 947 if (U_SUCCESS(*status)) { 948 if (collator == nullptr) { 949 *status = U_ILLEGAL_ARGUMENT_ERROR; 950 return; 951 } 952 953 if (strsrch) { 954 delete strsrch->textProcessedIter; 955 strsrch->textProcessedIter = nullptr; 956 ucol_closeElements(strsrch->textIter); 957 ucol_closeElements(strsrch->utilIter); 958 strsrch->textIter = strsrch->utilIter = nullptr; 959 if (strsrch->ownCollator && (strsrch->collator != collator)) { 960 ucol_close((UCollator *)strsrch->collator); 961 strsrch->ownCollator = false; 962 } 963 strsrch->collator = collator; 964 strsrch->strength = ucol_getStrength(collator); 965 strsrch->ceMask = getMask(strsrch->strength); 966 #if !UCONFIG_NO_BREAK_ITERATION 967 if (strsrch->search->internalBreakIter != nullptr) { 968 ubrk_close(strsrch->search->internalBreakIter); 969 strsrch->search->internalBreakIter = nullptr; // Lazily created. 970 } 971 #endif 972 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 973 strsrch->toShift = 974 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 975 UCOL_SHIFTED; 976 // if status is a failure, ucol_getVariableTop returns 0 977 strsrch->variableTop = ucol_getVariableTop(collator, status); 978 strsrch->textIter = ucol_openElements(collator, 979 strsrch->search->text, 980 strsrch->search->textLength, 981 status); 982 strsrch->utilIter = ucol_openElements( 983 collator, strsrch->pattern.text, strsrch->pattern.textLength, status); 984 // initialize() _after_ setting the iterators for the new collator. 985 initialize(strsrch, status); 986 } 987 988 // **** are these calls needed? 989 // **** we call uprv_init_pce in initializePatternPCETable 990 // **** and the CEIBuffer constructor... 991 #if 0 992 uprv_init_pce(strsrch->textIter); 993 uprv_init_pce(strsrch->utilIter); 994 #endif 995 } 996 } 997 998 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) 999 { 1000 if (strsrch) { 1001 return (UCollator *)strsrch->collator; 1002 } 1003 return nullptr; 1004 } 1005 1006 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 1007 const char16_t *pattern, 1008 int32_t patternlength, 1009 UErrorCode *status) 1010 { 1011 if (U_SUCCESS(*status)) { 1012 if (strsrch == nullptr || pattern == nullptr) { 1013 *status = U_ILLEGAL_ARGUMENT_ERROR; 1014 } 1015 else { 1016 if (patternlength == -1) { 1017 patternlength = u_strlen(pattern); 1018 } 1019 if (patternlength == 0) { 1020 *status = U_ILLEGAL_ARGUMENT_ERROR; 1021 return; 1022 } 1023 strsrch->pattern.text = pattern; 1024 strsrch->pattern.textLength = patternlength; 1025 initialize(strsrch, status); 1026 } 1027 } 1028 } 1029 1030 U_CAPI const char16_t* U_EXPORT2 1031 usearch_getPattern(const UStringSearch *strsrch, 1032 int32_t *length) 1033 { 1034 if (strsrch) { 1035 *length = strsrch->pattern.textLength; 1036 return strsrch->pattern.text; 1037 } 1038 return nullptr; 1039 } 1040 1041 // miscellaneous methods -------------------------------------------------- 1042 1043 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 1044 UErrorCode *status) 1045 { 1046 if (strsrch && U_SUCCESS(*status)) { 1047 strsrch->search->isForwardSearching = true; 1048 usearch_setOffset(strsrch, 0, status); 1049 if (U_SUCCESS(*status)) { 1050 return usearch_next(strsrch, status); 1051 } 1052 } 1053 return USEARCH_DONE; 1054 } 1055 1056 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 1057 int32_t position, 1058 UErrorCode *status) 1059 { 1060 if (strsrch && U_SUCCESS(*status)) { 1061 strsrch->search->isForwardSearching = true; 1062 // position checked in usearch_setOffset 1063 usearch_setOffset(strsrch, position, status); 1064 if (U_SUCCESS(*status)) { 1065 return usearch_next(strsrch, status); 1066 } 1067 } 1068 return USEARCH_DONE; 1069 } 1070 1071 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 1072 UErrorCode *status) 1073 { 1074 if (strsrch && U_SUCCESS(*status)) { 1075 strsrch->search->isForwardSearching = false; 1076 usearch_setOffset(strsrch, strsrch->search->textLength, status); 1077 if (U_SUCCESS(*status)) { 1078 return usearch_previous(strsrch, status); 1079 } 1080 } 1081 return USEARCH_DONE; 1082 } 1083 1084 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 1085 int32_t position, 1086 UErrorCode *status) 1087 { 1088 if (strsrch && U_SUCCESS(*status)) { 1089 strsrch->search->isForwardSearching = false; 1090 // position checked in usearch_setOffset 1091 usearch_setOffset(strsrch, position, status); 1092 if (U_SUCCESS(*status)) { 1093 return usearch_previous(strsrch, status); 1094 } 1095 } 1096 return USEARCH_DONE; 1097 } 1098 1099 /** 1100 * If a direction switch is required, we'll count the number of ces till the 1101 * beginning of the collation element iterator and iterate forwards that 1102 * number of times. This is so that we get to the correct point within the 1103 * string to continue the search in. Imagine when we are in the middle of the 1104 * normalization buffer when the change in direction is request. arrrgghh.... 1105 * After searching the offset within the collation element iterator will be 1106 * shifted to the start of the match. If a match is not found, the offset would 1107 * have been set to the end of the text string in the collation element 1108 * iterator. 1109 * Okay, here's my take on normalization buffer. The only time when there can 1110 * be 2 matches within the same normalization is when the pattern is consists 1111 * of all accents. But since the offset returned is from the text string, we 1112 * should not confuse the caller by returning the second match within the 1113 * same normalization buffer. If we do, the 2 results will have the same match 1114 * offsets, and that'll be confusing. I'll return the next match that doesn't 1115 * fall within the same normalization buffer. Note this does not affect the 1116 * results of matches spanning the text and the normalization buffer. 1117 * The position to start searching is taken from the collation element 1118 * iterator. Callers of this API would have to set the offset in the collation 1119 * element iterator before using this method. 1120 */ 1121 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 1122 UErrorCode *status) 1123 { 1124 if (U_SUCCESS(*status) && strsrch) { 1125 // note offset is either equivalent to the start of the previous match 1126 // or is set by the user 1127 int32_t offset = usearch_getOffset(strsrch); 1128 USearch *search = strsrch->search; 1129 search->reset = false; 1130 int32_t textlength = search->textLength; 1131 if (search->isForwardSearching) { 1132 if (offset == textlength || 1133 (! search->isOverlap && 1134 (search->matchedIndex != USEARCH_DONE && 1135 offset + search->matchedLength > textlength))) { 1136 // not enough characters to match 1137 setMatchNotFound(strsrch, *status); 1138 return USEARCH_DONE; 1139 } 1140 } 1141 else { 1142 // switching direction. 1143 // if matchedIndex == USEARCH_DONE, it means that either a 1144 // setOffset has been called or that previous ran off the text 1145 // string. the iterator would have been set to offset 0 if a 1146 // match is not found. 1147 search->isForwardSearching = true; 1148 if (search->matchedIndex != USEARCH_DONE) { 1149 // there's no need to set the collation element iterator 1150 // the next call to next will set the offset. 1151 return search->matchedIndex; 1152 } 1153 } 1154 1155 if (U_SUCCESS(*status)) { 1156 if (strsrch->pattern.cesLength == 0) { 1157 if (search->matchedIndex == USEARCH_DONE) { 1158 search->matchedIndex = offset; 1159 } 1160 else { // moves by codepoints 1161 U16_FWD_1(search->text, search->matchedIndex, textlength); 1162 } 1163 1164 search->matchedLength = 0; 1165 setColEIterOffset(strsrch->textIter, search->matchedIndex, *status); 1166 // status checked below 1167 if (search->matchedIndex == textlength) { 1168 search->matchedIndex = USEARCH_DONE; 1169 } 1170 } 1171 else { 1172 if (search->matchedLength > 0) { 1173 // if matchlength is 0 we are at the start of the iteration 1174 if (search->isOverlap) { 1175 ucol_setOffset(strsrch->textIter, offset + 1, status); 1176 } 1177 else { 1178 ucol_setOffset(strsrch->textIter, 1179 offset + search->matchedLength, status); 1180 } 1181 } 1182 else { 1183 // for boundary check purposes. this will ensure that the 1184 // next match will not precede the current offset 1185 // note search->matchedIndex will always be set to something 1186 // in the code 1187 search->matchedIndex = offset - 1; 1188 } 1189 1190 if (search->isCanonicalMatch) { 1191 // can't use exact here since extra accents are allowed. 1192 usearch_handleNextCanonical(strsrch, status); 1193 } 1194 else { 1195 usearch_handleNextExact(strsrch, status); 1196 } 1197 } 1198 1199 if (U_FAILURE(*status)) { 1200 return USEARCH_DONE; 1201 } 1202 1203 if (search->matchedIndex == USEARCH_DONE) { 1204 ucol_setOffset(strsrch->textIter, search->textLength, status); 1205 } else { 1206 ucol_setOffset(strsrch->textIter, search->matchedIndex, status); 1207 } 1208 1209 return search->matchedIndex; 1210 } 1211 } 1212 return USEARCH_DONE; 1213 } 1214 1215 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 1216 UErrorCode *status) 1217 { 1218 if (U_SUCCESS(*status) && strsrch) { 1219 int32_t offset; 1220 USearch *search = strsrch->search; 1221 if (search->reset) { 1222 offset = search->textLength; 1223 search->isForwardSearching = false; 1224 search->reset = false; 1225 setColEIterOffset(strsrch->textIter, offset, *status); 1226 } 1227 else { 1228 offset = usearch_getOffset(strsrch); 1229 } 1230 1231 int32_t matchedindex = search->matchedIndex; 1232 if (search->isForwardSearching) { 1233 // switching direction. 1234 // if matchedIndex == USEARCH_DONE, it means that either a 1235 // setOffset has been called or that next ran off the text 1236 // string. the iterator would have been set to offset textLength if 1237 // a match is not found. 1238 search->isForwardSearching = false; 1239 if (matchedindex != USEARCH_DONE) { 1240 return matchedindex; 1241 } 1242 } 1243 else { 1244 1245 // Could check pattern length, but the 1246 // linear search will do the right thing 1247 if (offset == 0 || matchedindex == 0) { 1248 setMatchNotFound(strsrch, *status); 1249 return USEARCH_DONE; 1250 } 1251 } 1252 1253 if (U_SUCCESS(*status)) { 1254 if (strsrch->pattern.cesLength == 0) { 1255 search->matchedIndex = 1256 (matchedindex == USEARCH_DONE ? offset : matchedindex); 1257 if (search->matchedIndex == 0) { 1258 setMatchNotFound(strsrch, *status); 1259 // status checked below 1260 } 1261 else { // move by codepoints 1262 U16_BACK_1(search->text, 0, search->matchedIndex); 1263 setColEIterOffset(strsrch->textIter, search->matchedIndex, *status); 1264 // status checked below 1265 search->matchedLength = 0; 1266 } 1267 } 1268 else { 1269 if (strsrch->search->isCanonicalMatch) { 1270 // can't use exact here since extra accents are allowed. 1271 usearch_handlePreviousCanonical(strsrch, status); 1272 // status checked below 1273 } 1274 else { 1275 usearch_handlePreviousExact(strsrch, status); 1276 // status checked below 1277 } 1278 } 1279 1280 if (U_FAILURE(*status)) { 1281 return USEARCH_DONE; 1282 } 1283 1284 return search->matchedIndex; 1285 } 1286 } 1287 return USEARCH_DONE; 1288 } 1289 1290 1291 1292 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) 1293 { 1294 /* 1295 reset is setting the attributes that are already in 1296 string search, hence all attributes in the collator should 1297 be retrieved without any problems 1298 */ 1299 if (strsrch) { 1300 UErrorCode status = U_ZERO_ERROR; 1301 UBool sameCollAttribute = true; 1302 uint32_t ceMask; 1303 UBool shift; 1304 uint32_t varTop; 1305 1306 // **** hack to deal w/ how processed CEs encode quaternary **** 1307 UCollationStrength newStrength = ucol_getStrength(strsrch->collator); 1308 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) || 1309 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) { 1310 sameCollAttribute = false; 1311 } 1312 1313 strsrch->strength = ucol_getStrength(strsrch->collator); 1314 ceMask = getMask(strsrch->strength); 1315 if (strsrch->ceMask != ceMask) { 1316 strsrch->ceMask = ceMask; 1317 sameCollAttribute = false; 1318 } 1319 1320 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 1321 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING, 1322 &status) == UCOL_SHIFTED; 1323 if (strsrch->toShift != shift) { 1324 strsrch->toShift = shift; 1325 sameCollAttribute = false; 1326 } 1327 1328 // if status is a failure, ucol_getVariableTop returns 0 1329 varTop = ucol_getVariableTop(strsrch->collator, &status); 1330 if (strsrch->variableTop != varTop) { 1331 strsrch->variableTop = varTop; 1332 sameCollAttribute = false; 1333 } 1334 if (!sameCollAttribute) { 1335 initialize(strsrch, &status); 1336 } 1337 ucol_setText(strsrch->textIter, strsrch->search->text, 1338 strsrch->search->textLength, 1339 &status); 1340 strsrch->search->matchedLength = 0; 1341 strsrch->search->matchedIndex = USEARCH_DONE; 1342 strsrch->search->isOverlap = false; 1343 strsrch->search->isCanonicalMatch = false; 1344 strsrch->search->elementComparisonType = 0; 1345 strsrch->search->isForwardSearching = true; 1346 strsrch->search->reset = true; 1347 } 1348 } 1349 1350 // 1351 // CEI Collation Element + source text index. 1352 // These structs are kept in the circular buffer. 1353 // 1354 struct CEI { 1355 int64_t ce; 1356 int32_t lowIndex; 1357 int32_t highIndex; 1358 }; 1359 1360 U_NAMESPACE_BEGIN 1361 1362 namespace { 1363 // 1364 // CEIBuffer A circular buffer of CEs-with-index from the text being searched. 1365 // 1366 #define DEFAULT_CEBUFFER_SIZE 96 1367 #define CEBUFFER_EXTRA 32 1368 // Some typical max values to make buffer size more reasonable for asymmetric search. 1369 // #8694 is for a better long-term solution to allocation of this buffer. 1370 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8 1371 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3 1372 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186)) 1373 struct CEIBuffer { 1374 CEI defBuf[DEFAULT_CEBUFFER_SIZE]; 1375 CEI *buf; 1376 int32_t bufSize; 1377 int32_t firstIx; 1378 int32_t limitIx; 1379 UCollationElements *ceIter; 1380 UStringSearch *strSearch; 1381 1382 1383 1384 CEIBuffer(UStringSearch *ss, UErrorCode *status); 1385 ~CEIBuffer(); 1386 const CEI *get(int32_t index); 1387 const CEI *getPrevious(int32_t index); 1388 }; 1389 1390 1391 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) { 1392 buf = defBuf; 1393 strSearch = ss; 1394 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA; 1395 if (ss->search->elementComparisonType != 0) { 1396 const char16_t * patText = ss->pattern.text; 1397 if (patText) { 1398 const char16_t * patTextLimit = patText + ss->pattern.textLength; 1399 while ( patText < patTextLimit ) { 1400 char16_t c = *patText++; 1401 if (MIGHT_BE_JAMO_L(c)) { 1402 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 1403 } else { 1404 // No check for surrogates, we might allocate slightly more buffer than necessary. 1405 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 1406 } 1407 } 1408 } 1409 } 1410 ceIter = ss->textIter; 1411 firstIx = 0; 1412 limitIx = 0; 1413 1414 if (!initTextProcessedIter(ss, status)) { return; } 1415 1416 if (bufSize>DEFAULT_CEBUFFER_SIZE) { 1417 buf = static_cast<CEI*>(uprv_malloc(bufSize * sizeof(CEI))); 1418 if (buf == nullptr) { 1419 *status = U_MEMORY_ALLOCATION_ERROR; 1420 } 1421 } 1422 } 1423 1424 // TODO: add a reset or init function so that allocated 1425 // buffers can be retained & reused. 1426 1427 CEIBuffer::~CEIBuffer() { 1428 if (buf != defBuf) { 1429 uprv_free(buf); 1430 } 1431 } 1432 1433 1434 // Get the CE with the specified index. 1435 // Index must be in the range 1436 // n-history_size < index < n+1 1437 // where n is the largest index to have been fetched by some previous call to this function. 1438 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 1439 // 1440 const CEI *CEIBuffer::get(int32_t index) { 1441 int i = index % bufSize; 1442 1443 if (index>=firstIx && index<limitIx) { 1444 // The request was for an entry already in our buffer. 1445 // Just return it. 1446 return &buf[i]; 1447 } 1448 1449 // Caller is requesting a new, never accessed before, CE. 1450 // Verify that it is the next one in sequence, which is all 1451 // that is allowed. 1452 if (index != limitIx) { 1453 UPRV_UNREACHABLE_ASSERT; 1454 // TODO: In ICU 64 the above was changed from U_ASSERT to UPRV_UNREACHABLE, 1455 // which unconditionally called abort(). However, there were cases in which it 1456 // was being hit, so it was changed back to U_ASSERT per ICU-20680. In ICU 70, 1457 // we now use the new UPRV_UNREACHABLE_ASSERT to better indicate the situation. 1458 // ICU-20792 tracks the follow-up work/further investigation on this. 1459 return nullptr; 1460 } 1461 1462 // Manage the circular CE buffer indexing 1463 limitIx++; 1464 1465 if (limitIx - firstIx >= bufSize) { 1466 // The buffer is full, knock out the lowest-indexed entry. 1467 firstIx++; 1468 } 1469 1470 UErrorCode status = U_ZERO_ERROR; 1471 1472 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 1473 1474 return &buf[i]; 1475 } 1476 1477 // Get the CE with the specified index. 1478 // Index must be in the range 1479 // n-history_size < index < n+1 1480 // where n is the largest index to have been fetched by some previous call to this function. 1481 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 1482 // 1483 const CEI *CEIBuffer::getPrevious(int32_t index) { 1484 int i = index % bufSize; 1485 1486 if (index>=firstIx && index<limitIx) { 1487 // The request was for an entry already in our buffer. 1488 // Just return it. 1489 return &buf[i]; 1490 } 1491 1492 // Caller is requesting a new, never accessed before, CE. 1493 // Verify that it is the next one in sequence, which is all 1494 // that is allowed. 1495 if (index != limitIx) { 1496 UPRV_UNREACHABLE_ASSERT; 1497 // TODO: In ICU 64 the above was changed from U_ASSERT to UPRV_UNREACHABLE, 1498 // which unconditionally called abort(). However, there were cases in which it 1499 // was being hit, so it was changed back to U_ASSERT per ICU-20680. In ICU 70, 1500 // we now use the new UPRV_UNREACHABLE_ASSERT to better indicate the situation. 1501 // ICU-20792 tracks the follow-up work/further investigation on this. 1502 return nullptr; 1503 } 1504 1505 // Manage the circular CE buffer indexing 1506 limitIx++; 1507 1508 if (limitIx - firstIx >= bufSize) { 1509 // The buffer is full, knock out the lowest-indexed entry. 1510 firstIx++; 1511 } 1512 1513 UErrorCode status = U_ZERO_ERROR; 1514 1515 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 1516 1517 return &buf[i]; 1518 } 1519 1520 } 1521 1522 U_NAMESPACE_END 1523 1524 1525 // #define USEARCH_DEBUG 1526 1527 #ifdef USEARCH_DEBUG 1528 #include <stdio.h> 1529 #include <stdlib.h> 1530 #endif 1531 1532 /* 1533 * Find the next break boundary after startIndex. If the UStringSearch object 1534 * has an external break iterator, use that. Otherwise use the internal character 1535 * break iterator. 1536 */ 1537 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex, UErrorCode &status) { 1538 if (U_FAILURE(status)) { 1539 return startIndex; 1540 } 1541 #if 0 1542 const char16_t *text = strsrch->search->text; 1543 int32_t textLen = strsrch->search->textLength; 1544 1545 U_ASSERT(startIndex>=0); 1546 U_ASSERT(startIndex<=textLen); 1547 1548 if (startIndex >= textLen) { 1549 return startIndex; 1550 } 1551 1552 UChar32 c; 1553 int32_t i = startIndex; 1554 U16_NEXT(text, i, textLen, c); 1555 1556 // If we are on a control character, stop without looking for combining marks. 1557 // Control characters do not combine. 1558 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1559 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 1560 return i; 1561 } 1562 1563 // The initial character was not a control, and can thus accept trailing 1564 // combining characters. Advance over however many of them there are. 1565 int32_t indexOfLastCharChecked; 1566 for (;;) { 1567 indexOfLastCharChecked = i; 1568 if (i>=textLen) { 1569 break; 1570 } 1571 U16_NEXT(text, i, textLen, c); 1572 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1573 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 1574 break; 1575 } 1576 } 1577 return indexOfLastCharChecked; 1578 #elif !UCONFIG_NO_BREAK_ITERATION 1579 UBreakIterator *breakiterator = getBreakIterator(strsrch, status); 1580 if (U_FAILURE(status)) { 1581 return startIndex; 1582 } 1583 1584 return ubrk_following(breakiterator, startIndex); 1585 #else 1586 // **** or should we use the original code? **** 1587 return startIndex; 1588 #endif 1589 1590 } 1591 1592 /* 1593 * Returns true if index is on a break boundary. If the UStringSearch 1594 * has an external break iterator, test using that, otherwise test 1595 * using the internal character break iterator. 1596 */ 1597 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index, UErrorCode &status) { 1598 if (U_FAILURE(status)) { 1599 return true; 1600 } 1601 #if 0 1602 const char16_t *text = strsrch->search->text; 1603 int32_t textLen = strsrch->search->textLength; 1604 1605 U_ASSERT(index>=0); 1606 U_ASSERT(index<=textLen); 1607 1608 if (index>=textLen || index<=0) { 1609 return true; 1610 } 1611 1612 // If the character at the current index is not a GRAPHEME_EXTEND 1613 // then we can not be within a combining sequence. 1614 UChar32 c; 1615 U16_GET(text, 0, index, textLen, c); 1616 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1617 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 1618 return true; 1619 } 1620 1621 // We are at a combining mark. If the preceding character is anything 1622 // except a CONTROL, CR or LF, we are in a combining sequence. 1623 U16_PREV(text, 0, index, c); 1624 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 1625 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 1626 return !combining; 1627 #elif !UCONFIG_NO_BREAK_ITERATION 1628 UBreakIterator *breakiterator = getBreakIterator(strsrch, status); 1629 if (U_FAILURE(status)) { 1630 return true; 1631 } 1632 1633 return ubrk_isBoundary(breakiterator, index); 1634 #else 1635 // **** or use the original code? **** 1636 return true; 1637 #endif 1638 } 1639 1640 #if 0 1641 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end, UErrorCode &status) 1642 { 1643 if (U_FAILURE(status)) { 1644 return true; 1645 } 1646 1647 #if !UCONFIG_NO_BREAK_ITERATION 1648 UBreakIterator *breakiterator = getBreakIterator(strsrch, status); 1649 if (U_SUCCESS(status)) { 1650 int32_t startindex = ubrk_first(breakiterator); 1651 int32_t endindex = ubrk_last(breakiterator); 1652 1653 // out-of-range indexes are never boundary positions 1654 if (start < startindex || start > endindex || 1655 end < startindex || end > endindex) { 1656 return false; 1657 } 1658 1659 return ubrk_isBoundary(breakiterator, start) && 1660 ubrk_isBoundary(breakiterator, end); 1661 } 1662 #endif 1663 1664 return true; 1665 } 1666 #endif 1667 1668 typedef enum { 1669 U_CE_MATCH = -1, 1670 U_CE_NO_MATCH = 0, 1671 U_CE_SKIP_TARG, 1672 U_CE_SKIP_PATN 1673 } UCompareCEsResult; 1674 #define U_CE_LEVEL2_BASE 0x00000005 1675 #define U_CE_LEVEL3_BASE 0x00050000 1676 1677 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { 1678 if (targCE == patCE) { 1679 return U_CE_MATCH; 1680 } 1681 if (compareType == 0) { 1682 return U_CE_NO_MATCH; 1683 } 1684 1685 int64_t targCEshifted = targCE >> 32; 1686 int64_t patCEshifted = patCE >> 32; 1687 int64_t mask; 1688 1689 mask = 0xFFFF0000; 1690 int32_t targLev1 = static_cast<int32_t>(targCEshifted & mask); 1691 int32_t patLev1 = static_cast<int32_t>(patCEshifted & mask); 1692 if ( targLev1 != patLev1 ) { 1693 if ( targLev1 == 0 ) { 1694 return U_CE_SKIP_TARG; 1695 } 1696 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 1697 return U_CE_SKIP_PATN; 1698 } 1699 return U_CE_NO_MATCH; 1700 } 1701 1702 mask = 0x0000FFFF; 1703 int32_t targLev2 = static_cast<int32_t>(targCEshifted & mask); 1704 int32_t patLev2 = static_cast<int32_t>(patCEshifted & mask); 1705 if ( targLev2 != patLev2 ) { 1706 if ( targLev2 == 0 ) { 1707 return U_CE_SKIP_TARG; 1708 } 1709 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 1710 return U_CE_SKIP_PATN; 1711 } 1712 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )? 1713 U_CE_MATCH: U_CE_NO_MATCH; 1714 } 1715 1716 mask = 0xFFFF0000; 1717 int32_t targLev3 = static_cast<int32_t>(targCE & mask); 1718 int32_t patLev3 = static_cast<int32_t>(patCE & mask); 1719 if ( targLev3 != patLev3 ) { 1720 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )? 1721 U_CE_MATCH: U_CE_NO_MATCH; 1722 } 1723 1724 return U_CE_MATCH; 1725 } 1726 1727 namespace { 1728 1729 UChar32 codePointAt(const USearch &search, int32_t index) { 1730 if (index < search.textLength) { 1731 UChar32 c; 1732 U16_NEXT(search.text, index, search.textLength, c); 1733 return c; 1734 } 1735 return U_SENTINEL; 1736 } 1737 1738 UChar32 codePointBefore(const USearch &search, int32_t index) { 1739 if (0 < index) { 1740 UChar32 c; 1741 U16_PREV(search.text, 0, index, c); 1742 return c; 1743 } 1744 return U_SENTINEL; 1745 } 1746 1747 } // namespace 1748 1749 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 1750 int32_t startIdx, 1751 int32_t *matchStart, 1752 int32_t *matchLimit, 1753 UErrorCode *status) 1754 { 1755 if (U_FAILURE(*status)) { 1756 return false; 1757 } 1758 1759 // TODO: reject search patterns beginning with a combining char. 1760 1761 #ifdef USEARCH_DEBUG 1762 if (getenv("USEARCH_DEBUG") != nullptr) { 1763 printf("Pattern CEs\n"); 1764 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) { 1765 printf(" %8x", strsrch->pattern.ces[ii]); 1766 } 1767 printf("\n"); 1768 } 1769 1770 #endif 1771 // Input parameter sanity check. 1772 // TODO: should input indices clip to the text length 1773 // in the same way that UText does. 1774 if(strsrch->pattern.cesLength == 0 || 1775 startIdx < 0 || 1776 startIdx > strsrch->search->textLength || 1777 strsrch->pattern.ces == nullptr) { 1778 *status = U_ILLEGAL_ARGUMENT_ERROR; 1779 return false; 1780 } 1781 1782 if (strsrch->pattern.pces == nullptr) { 1783 initializePatternPCETable(strsrch, status); 1784 } 1785 1786 ucol_setOffset(strsrch->textIter, startIdx, status); 1787 CEIBuffer ceb(strsrch, status); 1788 1789 // An out-of-memory (OOM) failure can occur in the initializePatternPCETable function 1790 // or CEIBuffer constructor above, so we need to check the status. 1791 if (U_FAILURE(*status)) { 1792 return false; 1793 } 1794 1795 int32_t targetIx = 0; 1796 const CEI *targetCEI = nullptr; 1797 int32_t patIx; 1798 UBool found; 1799 1800 int32_t mStart = -1; 1801 int32_t mLimit = -1; 1802 int32_t minLimit; 1803 int32_t maxLimit; 1804 1805 1806 1807 // Outer loop moves over match starting positions in the 1808 // target CE space. 1809 // Here we see the target as a sequence of collation elements, resulting from the following: 1810 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 1811 // (for example, digraphs such as IJ may be broken into two characters). 1812 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 1813 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 1814 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 1815 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strength is primary), 1816 // then the CE is deleted, so the following code sees only CEs that are relevant. 1817 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 1818 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 1819 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 1820 // 1821 for(targetIx=0; ; targetIx++) 1822 { 1823 found = true; 1824 // Inner loop checks for a match beginning at each 1825 // position from the outer loop. 1826 int32_t targetIxOffset = 0; 1827 int64_t patCE = 0; 1828 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 1829 // (compared to the last CE fetched for the previous targetIx value) as we need to go 1830 // for this targetIx value, so if it is non-nullptr then other ceb.get calls should be OK. 1831 const CEI *firstCEI = ceb.get(targetIx); 1832 if (firstCEI == nullptr) { 1833 *status = U_INTERNAL_PROGRAM_ERROR; 1834 found = false; 1835 break; 1836 } 1837 1838 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) { 1839 patCE = strsrch->pattern.pces[patIx]; 1840 targetCEI = ceb.get(targetIx+patIx+targetIxOffset); 1841 // Compare CE from target string with CE from the pattern. 1842 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 1843 // which will fail the compare, below. 1844 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 1845 if ( ceMatch == U_CE_NO_MATCH ) { 1846 found = false; 1847 break; 1848 } else if ( ceMatch > U_CE_NO_MATCH ) { 1849 if ( ceMatch == U_CE_SKIP_TARG ) { 1850 // redo with same patCE, next targCE 1851 patIx--; 1852 targetIxOffset++; 1853 } else { // ceMatch == U_CE_SKIP_PATN 1854 // redo with same targCE, next patCE 1855 targetIxOffset--; 1856 } 1857 } 1858 } 1859 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far 1860 1861 if (!found && ((targetCEI == nullptr) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 1862 // No match at this targetIx. Try again at the next. 1863 continue; 1864 } 1865 1866 if (!found) { 1867 // No match at all, we have run off the end of the target text. 1868 break; 1869 } 1870 1871 1872 // We have found a match in CE space. 1873 // Now determine the bounds in string index space. 1874 // There still is a chance of match failure if the CE range not correspond to 1875 // an acceptable character range. 1876 // 1877 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1); 1878 1879 mStart = firstCEI->lowIndex; 1880 minLimit = lastCEI->lowIndex; 1881 1882 // Look at the CE following the match. If it is UCOL_NULLORDER the match 1883 // extended to the end of input, and the match is good. 1884 1885 // Look at the high and low indices of the CE following the match. If 1886 // they are the same it means one of two things: 1887 // 1. The match extended to the last CE from the target text, which is OK, or 1888 // 2. The last CE that was part of the match is in an expansion that extends 1889 // to the first CE after the match. In this case, we reject the match. 1890 const CEI* nextCEI = nullptr; 1891 if (strsrch->search->elementComparisonType == 0) { 1892 nextCEI = ceb.get(targetIx + targetIxOffset); 1893 maxLimit = nextCEI->lowIndex; 1894 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 1895 found = false; 1896 } 1897 } else { 1898 for ( ; ; ++targetIxOffset ) { 1899 nextCEI = ceb.get(targetIx + targetIxOffset); 1900 maxLimit = nextCEI->lowIndex; 1901 // If we are at the end of the target too, match succeeds 1902 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) { 1903 break; 1904 } 1905 // As long as the next CE has primary weight of 0, 1906 // it is part of the last target element matched by the pattern; 1907 // make sure it can be part of a match with the last patCE 1908 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) { 1909 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType); 1910 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) { 1911 found = false; 1912 break; 1913 } 1914 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 1915 // target element, but it has non-zero primary weight => match fails 1916 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) { 1917 found = false; 1918 break; 1919 // Else the target CE is not part of an expansion of the last matched element, match succeeds 1920 } else { 1921 break; 1922 } 1923 } 1924 } 1925 1926 1927 // Check for the start of the match being within a combining sequence. 1928 // This can happen if the pattern itself begins with a combining char, and 1929 // the match found combining marks in the target text that were attached 1930 // to something else. 1931 // This type of match should be rejected for not completely consuming a 1932 // combining sequence. 1933 if (!isBreakBoundary(strsrch, mStart, *status)) { 1934 found = false; 1935 } 1936 if (U_FAILURE(*status)) { 1937 break; 1938 } 1939 1940 // Check for the start of the match being within an Collation Element Expansion, 1941 // meaning that the first char of the match is only partially matched. 1942 // With expansions, the first CE will report the index of the source 1943 // character, and all subsequent (expansions) CEs will report the source index of the 1944 // _following_ character. 1945 int32_t secondIx = firstCEI->highIndex; 1946 if (mStart == secondIx) { 1947 found = false; 1948 } 1949 1950 // Allow matches to end in the middle of a grapheme cluster if the following 1951 // conditions are met; this is needed to make prefix search work properly in 1952 // Indic, see #11750 1953 // * the default breakIter is being used 1954 // * the next collation element after this combining sequence 1955 // - has non-zero primary weight 1956 // - corresponds to a separate character following the one at end of the current match 1957 // (the second of these conditions, and perhaps both, may be redundant given the 1958 // subsequent check for normalization boundary; however they are likely much faster 1959 // tests in any case) 1960 // * the match limit is a normalization boundary 1961 UBool allowMidclusterMatch = false; 1962 if (strsrch->search->text != nullptr && strsrch->search->textLength > maxLimit) { 1963 allowMidclusterMatch = 1964 strsrch->search->breakIter == nullptr && 1965 nextCEI != nullptr && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && 1966 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && 1967 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || 1968 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); 1969 } 1970 // If those conditions are met, then: 1971 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 1972 // the match limit may be backed off to a previous break boundary. This handles 1973 // cases in which mLimit includes target characters that are ignorable with current 1974 // settings (such as space) and which extend beyond the pattern match. 1975 // * do NOT require that end of the combining sequence not extend beyond the match in CE space 1976 // * do NOT require that match limit be on a breakIter boundary 1977 1978 // Advance the match end position to the first acceptable match boundary. 1979 // This advances the index over any combining characters. 1980 mLimit = maxLimit; 1981 if (minLimit < maxLimit) { 1982 // When the last CE's low index is same with its high index, the CE is likely 1983 // a part of expansion. In this case, the index is located just after the 1984 // character corresponding to the CEs compared above. If the index is right 1985 // at the break boundary, move the position to the next boundary will result 1986 // incorrect match length when there are ignorable characters exist between 1987 // the position and the next character produces CE(s). See ticket#8482. 1988 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit, *status)) { 1989 mLimit = minLimit; 1990 } else { 1991 int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status); 1992 // Note that we can have nba < maxLimit && nba >= minLImit, in which 1993 // case we want to set mLimit to nba regardless of allowMidclusterMatch 1994 // (i.e. we back off mLimit to the previous breakIterator boundary). 1995 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { 1996 mLimit = nba; 1997 } 1998 } 1999 } 2000 2001 if (U_FAILURE(*status)) { 2002 break; 2003 } 2004 2005 #ifdef USEARCH_DEBUG 2006 if (getenv("USEARCH_DEBUG") != nullptr) { 2007 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 2008 } 2009 #endif 2010 2011 if (!allowMidclusterMatch) { 2012 // If advancing to the end of a combining sequence in character indexing space 2013 // advanced us beyond the end of the match in CE space, reject this match. 2014 if (mLimit > maxLimit) { 2015 found = false; 2016 } 2017 2018 if (!isBreakBoundary(strsrch, mLimit, *status)) { 2019 found = false; 2020 } 2021 if (U_FAILURE(*status)) { 2022 break; 2023 } 2024 } 2025 2026 if (! checkIdentical(strsrch, mStart, mLimit)) { 2027 found = false; 2028 } 2029 2030 if (found) { 2031 break; 2032 } 2033 } 2034 2035 #ifdef USEARCH_DEBUG 2036 if (getenv("USEARCH_DEBUG") != nullptr) { 2037 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 2038 int32_t lastToPrint = ceb.limitIx+2; 2039 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 2040 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 2041 } 2042 printf("\n%s\n", found? "match found" : "no match"); 2043 } 2044 #endif 2045 2046 // All Done. Store back the match bounds to the caller. 2047 // 2048 2049 if (U_FAILURE(*status)) { 2050 found = false; // No match if a failure occured. 2051 } 2052 2053 if (found==false) { 2054 mLimit = -1; 2055 mStart = -1; 2056 } 2057 2058 if (matchStart != nullptr) { 2059 *matchStart= mStart; 2060 } 2061 2062 if (matchLimit != nullptr) { 2063 *matchLimit = mLimit; 2064 } 2065 2066 return found; 2067 } 2068 2069 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 2070 int32_t startIdx, 2071 int32_t *matchStart, 2072 int32_t *matchLimit, 2073 UErrorCode *status) 2074 { 2075 if (U_FAILURE(*status)) { 2076 return false; 2077 } 2078 2079 // TODO: reject search patterns beginning with a combining char. 2080 2081 #ifdef USEARCH_DEBUG 2082 if (getenv("USEARCH_DEBUG") != nullptr) { 2083 printf("Pattern CEs\n"); 2084 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) { 2085 printf(" %8x", strsrch->pattern.ces[ii]); 2086 } 2087 printf("\n"); 2088 } 2089 2090 #endif 2091 // Input parameter sanity check. 2092 // TODO: should input indices clip to the text length 2093 // in the same way that UText does. 2094 if(strsrch->pattern.cesLength == 0 || 2095 startIdx < 0 || 2096 startIdx > strsrch->search->textLength || 2097 strsrch->pattern.ces == nullptr) { 2098 *status = U_ILLEGAL_ARGUMENT_ERROR; 2099 return false; 2100 } 2101 2102 if (strsrch->pattern.pces == nullptr) { 2103 initializePatternPCETable(strsrch, status); 2104 } 2105 2106 CEIBuffer ceb(strsrch, status); 2107 int32_t targetIx = 0; 2108 2109 /* 2110 * Pre-load the buffer with the CE's for the grapheme 2111 * after our starting position so that we're sure that 2112 * we can look at the CE following the match when we 2113 * check the match boundaries. 2114 * 2115 * This will also pre-fetch the first CE that we'll 2116 * consider for the match. 2117 */ 2118 if (startIdx < strsrch->search->textLength) { 2119 UBreakIterator *breakiterator = getBreakIterator(strsrch, *status); 2120 if (U_FAILURE(*status)) { 2121 return false; 2122 } 2123 int32_t next = ubrk_following(breakiterator, startIdx); 2124 2125 ucol_setOffset(strsrch->textIter, next, status); 2126 2127 for (targetIx = 0; ; targetIx += 1) { 2128 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) { 2129 break; 2130 } 2131 } 2132 } else { 2133 ucol_setOffset(strsrch->textIter, startIdx, status); 2134 } 2135 2136 // An out-of-memory (OOM) failure can occur above, so we need to check the status. 2137 if (U_FAILURE(*status)) { 2138 return false; 2139 } 2140 2141 const CEI *targetCEI = nullptr; 2142 int32_t patIx; 2143 UBool found; 2144 2145 int32_t limitIx = targetIx; 2146 int32_t mStart = -1; 2147 int32_t mLimit = -1; 2148 int32_t minLimit; 2149 int32_t maxLimit; 2150 2151 2152 2153 // Outer loop moves over match starting positions in the 2154 // target CE space. 2155 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 2156 // But patIx is 0 at the beginning of the pattern and increases toward the end. 2157 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 2158 // and the beginning of the base text. 2159 for(targetIx = limitIx; ; targetIx += 1) 2160 { 2161 found = true; 2162 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 2163 // (compared to the last CE fetched for the previous targetIx value) as we need to go 2164 // for this targetIx value, so if it is non-nullptr then other ceb.getPrevious calls should be OK. 2165 const CEI *lastCEI = ceb.getPrevious(targetIx); 2166 if (lastCEI == nullptr) { 2167 *status = U_INTERNAL_PROGRAM_ERROR; 2168 found = false; 2169 break; 2170 } 2171 // Inner loop checks for a match beginning at each 2172 // position from the outer loop. 2173 int32_t targetIxOffset = 0; 2174 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) { 2175 int64_t patCE = strsrch->pattern.pces[patIx]; 2176 2177 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset); 2178 // Compare CE from target string with CE from the pattern. 2179 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 2180 // which will fail the compare, below. 2181 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 2182 if ( ceMatch == U_CE_NO_MATCH ) { 2183 found = false; 2184 break; 2185 } else if ( ceMatch > U_CE_NO_MATCH ) { 2186 if ( ceMatch == U_CE_SKIP_TARG ) { 2187 // redo with same patCE, next targCE 2188 patIx++; 2189 targetIxOffset++; 2190 } else { // ceMatch == U_CE_SKIP_PATN 2191 // redo with same targCE, next patCE 2192 targetIxOffset--; 2193 } 2194 } 2195 } 2196 2197 if (!found && ((targetCEI == nullptr) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 2198 // No match at this targetIx. Try again at the next. 2199 continue; 2200 } 2201 2202 if (!found) { 2203 // No match at all, we have run off the end of the target text. 2204 break; 2205 } 2206 2207 2208 // We have found a match in CE space. 2209 // Now determine the bounds in string index space. 2210 // There still is a chance of match failure if the CE range not correspond to 2211 // an acceptable character range. 2212 // 2213 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset); 2214 mStart = firstCEI->lowIndex; 2215 2216 // Check for the start of the match being within a combining sequence. 2217 // This can happen if the pattern itself begins with a combining char, and 2218 // the match found combining marks in the target text that were attached 2219 // to something else. 2220 // This type of match should be rejected for not completely consuming a 2221 // combining sequence. 2222 if (!isBreakBoundary(strsrch, mStart, *status)) { 2223 found = false; 2224 } 2225 if (U_FAILURE(*status)) { 2226 break; 2227 } 2228 2229 // Look at the high index of the first CE in the match. If it's the same as the 2230 // low index, the first CE in the match is in the middle of an expansion. 2231 if (mStart == firstCEI->highIndex) { 2232 found = false; 2233 } 2234 2235 2236 minLimit = lastCEI->lowIndex; 2237 2238 if (targetIx > 0) { 2239 // Look at the CE following the match. If it is UCOL_NULLORDER the match 2240 // extended to the end of input, and the match is good. 2241 2242 // Look at the high and low indices of the CE following the match. If 2243 // they are the same it means one of two things: 2244 // 1. The match extended to the last CE from the target text, which is OK, or 2245 // 2. The last CE that was part of the match is in an expansion that extends 2246 // to the first CE after the match. In this case, we reject the match. 2247 const CEI *nextCEI = ceb.getPrevious(targetIx - 1); 2248 2249 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 2250 found = false; 2251 } 2252 2253 mLimit = maxLimit = nextCEI->lowIndex; 2254 2255 // Allow matches to end in the middle of a grapheme cluster if the following 2256 // conditions are met; this is needed to make prefix search work properly in 2257 // Indic, see #11750 2258 // * the default breakIter is being used 2259 // * the next collation element after this combining sequence 2260 // - has non-zero primary weight 2261 // - corresponds to a separate character following the one at end of the current match 2262 // (the second of these conditions, and perhaps both, may be redundant given the 2263 // subsequent check for normalization boundary; however they are likely much faster 2264 // tests in any case) 2265 // * the match limit is a normalization boundary 2266 UBool allowMidclusterMatch = false; 2267 if (strsrch->search->text != nullptr && strsrch->search->textLength > maxLimit) { 2268 allowMidclusterMatch = 2269 strsrch->search->breakIter == nullptr && 2270 nextCEI != nullptr && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 && 2271 maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit && 2272 (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) || 2273 strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit))); 2274 } 2275 // If those conditions are met, then: 2276 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 2277 // the match limit may be backed off to a previous break boundary. This handles 2278 // cases in which mLimit includes target characters that are ignorable with current 2279 // settings (such as space) and which extend beyond the pattern match. 2280 // * do NOT require that end of the combining sequence not extend beyond the match in CE space 2281 // * do NOT require that match limit be on a breakIter boundary 2282 2283 // Advance the match end position to the first acceptable match boundary. 2284 // This advances the index over any combining characters. 2285 if (minLimit < maxLimit) { 2286 int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status); 2287 // Note that we can have nba < maxLimit && nba >= minLImit, in which 2288 // case we want to set mLimit to nba regardless of allowMidclusterMatch 2289 // (i.e. we back off mLimit to the previous breakIterator boundary). 2290 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) { 2291 mLimit = nba; 2292 } 2293 } 2294 2295 if (!allowMidclusterMatch) { 2296 // If advancing to the end of a combining sequence in character indexing space 2297 // advanced us beyond the end of the match in CE space, reject this match. 2298 if (mLimit > maxLimit) { 2299 found = false; 2300 } 2301 2302 // Make sure the end of the match is on a break boundary 2303 if (!isBreakBoundary(strsrch, mLimit, *status)) { 2304 found = false; 2305 } 2306 if (U_FAILURE(*status)) { 2307 break; 2308 } 2309 } 2310 2311 } else { 2312 // No non-ignorable CEs after this point. 2313 // The maximum position is detected by boundary after 2314 // the last non-ignorable CE. Combining sequence 2315 // across the start index will be truncated. 2316 int32_t nba = nextBoundaryAfter(strsrch, minLimit, *status); 2317 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 2318 } 2319 2320 #ifdef USEARCH_DEBUG 2321 if (getenv("USEARCH_DEBUG") != nullptr) { 2322 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 2323 } 2324 #endif 2325 2326 2327 if (! checkIdentical(strsrch, mStart, mLimit)) { 2328 found = false; 2329 } 2330 2331 if (found) { 2332 break; 2333 } 2334 } 2335 2336 #ifdef USEARCH_DEBUG 2337 if (getenv("USEARCH_DEBUG") != nullptr) { 2338 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 2339 int32_t lastToPrint = ceb.limitIx+2; 2340 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 2341 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 2342 } 2343 printf("\n%s\n", found? "match found" : "no match"); 2344 } 2345 #endif 2346 2347 // All Done. Store back the match bounds to the caller. 2348 // 2349 2350 if (U_FAILURE(*status)) { 2351 found = false; // No match if a failure occured. 2352 } 2353 2354 if (found==false) { 2355 mLimit = -1; 2356 mStart = -1; 2357 } 2358 2359 if (matchStart != nullptr) { 2360 *matchStart= mStart; 2361 } 2362 2363 if (matchLimit != nullptr) { 2364 *matchLimit = mLimit; 2365 } 2366 2367 return found; 2368 } 2369 2370 // internal use methods declared in usrchimp.h ----------------------------- 2371 2372 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) 2373 { 2374 if (U_FAILURE(*status)) { 2375 setMatchNotFound(strsrch, *status); 2376 return false; 2377 } 2378 2379 int32_t textOffset = ucol_getOffset(strsrch->textIter); 2380 int32_t start = -1; 2381 int32_t end = -1; 2382 2383 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 2384 strsrch->search->matchedIndex = start; 2385 strsrch->search->matchedLength = end - start; 2386 return true; 2387 } else { 2388 setMatchNotFound(strsrch, *status); 2389 return false; 2390 } 2391 } 2392 2393 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) 2394 { 2395 if (U_FAILURE(*status)) { 2396 setMatchNotFound(strsrch, *status); 2397 return false; 2398 } 2399 2400 int32_t textOffset = ucol_getOffset(strsrch->textIter); 2401 int32_t start = -1; 2402 int32_t end = -1; 2403 2404 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 2405 strsrch->search->matchedIndex = start; 2406 strsrch->search->matchedLength = end - start; 2407 return true; 2408 } else { 2409 setMatchNotFound(strsrch, *status); 2410 return false; 2411 } 2412 } 2413 2414 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) 2415 { 2416 if (U_FAILURE(*status)) { 2417 setMatchNotFound(strsrch, *status); 2418 return false; 2419 } 2420 2421 int32_t textOffset; 2422 2423 if (strsrch->search->isOverlap) { 2424 if (strsrch->search->matchedIndex != USEARCH_DONE) { 2425 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 2426 } else { 2427 // move the start position at the end of possible match 2428 initializePatternPCETable(strsrch, status); 2429 if (!initTextProcessedIter(strsrch, status)) { 2430 setMatchNotFound(strsrch, *status); 2431 return false; 2432 } 2433 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { 2434 int64_t pce = strsrch->textProcessedIter->nextProcessed(nullptr, nullptr, status); 2435 if (pce == UCOL_PROCESSED_NULLORDER) { 2436 // at the end of the text 2437 break; 2438 } 2439 } 2440 if (U_FAILURE(*status)) { 2441 setMatchNotFound(strsrch, *status); 2442 return false; 2443 } 2444 textOffset = ucol_getOffset(strsrch->textIter); 2445 } 2446 } else { 2447 textOffset = ucol_getOffset(strsrch->textIter); 2448 } 2449 2450 int32_t start = -1; 2451 int32_t end = -1; 2452 2453 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 2454 strsrch->search->matchedIndex = start; 2455 strsrch->search->matchedLength = end - start; 2456 return true; 2457 } else { 2458 setMatchNotFound(strsrch, *status); 2459 return false; 2460 } 2461 } 2462 2463 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 2464 UErrorCode *status) 2465 { 2466 if (U_FAILURE(*status)) { 2467 setMatchNotFound(strsrch, *status); 2468 return false; 2469 } 2470 2471 int32_t textOffset; 2472 2473 if (strsrch->search->isOverlap) { 2474 if (strsrch->search->matchedIndex != USEARCH_DONE) { 2475 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 2476 } else { 2477 // move the start position at the end of possible match 2478 initializePatternPCETable(strsrch, status); 2479 if (!initTextProcessedIter(strsrch, status)) { 2480 setMatchNotFound(strsrch, *status); 2481 return false; 2482 } 2483 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) { 2484 int64_t pce = strsrch->textProcessedIter->nextProcessed(nullptr, nullptr, status); 2485 if (pce == UCOL_PROCESSED_NULLORDER) { 2486 // at the end of the text 2487 break; 2488 } 2489 } 2490 if (U_FAILURE(*status)) { 2491 setMatchNotFound(strsrch, *status); 2492 return false; 2493 } 2494 textOffset = ucol_getOffset(strsrch->textIter); 2495 } 2496 } else { 2497 textOffset = ucol_getOffset(strsrch->textIter); 2498 } 2499 2500 int32_t start = -1; 2501 int32_t end = -1; 2502 2503 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 2504 strsrch->search->matchedIndex = start; 2505 strsrch->search->matchedLength = end - start; 2506 return true; 2507 } else { 2508 setMatchNotFound(strsrch, *status); 2509 return false; 2510 } 2511 } 2512 2513 #endif /* #if !UCONFIG_NO_COLLATION */