unisetspan.cpp (62307B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2007-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: unisetspan.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2007mar01 16 * created by: Markus W. Scherer 17 */ 18 19 #include "unicode/utypes.h" 20 #include "unicode/uniset.h" 21 #include "unicode/ustring.h" 22 #include "unicode/utf8.h" 23 #include "unicode/utf16.h" 24 #include "cmemory.h" 25 #include "uvector.h" 26 #include "unisetspan.h" 27 28 U_NAMESPACE_BEGIN 29 30 /* 31 * List of offsets from the current position from where to try matching 32 * a code point or a string. 33 * Store offsets rather than indexes to simplify the code and use the same list 34 * for both increments (in span()) and decrements (in spanBack()). 35 * 36 * Assumption: The maximum offset is limited, and the offsets that are stored 37 * at any one time are relatively dense, that is, there are normally no gaps of 38 * hundreds or thousands of offset values. 39 * 40 * The implementation uses a circular buffer of byte flags, 41 * each indicating whether the corresponding offset is in the list. 42 * This avoids inserting into a sorted list of offsets (or absolute indexes) and 43 * physically moving part of the list. 44 * 45 * Note: In principle, the caller should setMaxLength() to the maximum of the 46 * max string length and U16_LENGTH/U8_LENGTH to account for 47 * "long" single code points. 48 * However, this implementation uses at least a staticList with more than 49 * U8_LENGTH entries anyway. 50 * 51 * Note: If maxLength were guaranteed to be no more than 32 or 64, 52 * the list could be stored as bit flags in a single integer. 53 * Rather than handling a circular buffer with a start list index, 54 * the integer would simply be shifted when lower offsets are removed. 55 * UnicodeSet does not have a limit on the lengths of strings. 56 */ 57 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. 58 public: 59 OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 60 61 ~OffsetList() { 62 if(list!=staticList) { 63 uprv_free(list); 64 } 65 } 66 67 // Call exactly once if the list is to be used. 68 void setMaxLength(int32_t maxLength) { 69 if (maxLength <= static_cast<int32_t>(sizeof(staticList))) { 70 capacity = static_cast<int32_t>(sizeof(staticList)); 71 } else { 72 UBool* l = static_cast<UBool*>(uprv_malloc(maxLength)); 73 if(l!=nullptr) { 74 list=l; 75 capacity=maxLength; 76 } 77 } 78 uprv_memset(list, 0, capacity); 79 } 80 81 void clear() { 82 uprv_memset(list, 0, capacity); 83 start=length=0; 84 } 85 86 UBool isEmpty() const { 87 return length == 0; 88 } 89 90 // Reduce all stored offsets by delta, used when the current position 91 // moves by delta. 92 // There must not be any offsets lower than delta. 93 // If there is an offset equal to delta, it is removed. 94 // delta=[1..maxLength] 95 void shift(int32_t delta) { 96 int32_t i=start+delta; 97 if(i>=capacity) { 98 i-=capacity; 99 } 100 if(list[i]) { 101 list[i]=false; 102 --length; 103 } 104 start=i; 105 } 106 107 // Add an offset. The list must not contain it yet. 108 // offset=[1..maxLength] 109 void addOffset(int32_t offset) { 110 int32_t i=start+offset; 111 if(i>=capacity) { 112 i-=capacity; 113 } 114 list[i]=true; 115 ++length; 116 } 117 118 // offset=[1..maxLength] 119 UBool containsOffset(int32_t offset) const { 120 int32_t i=start+offset; 121 if(i>=capacity) { 122 i-=capacity; 123 } 124 return list[i]; 125 } 126 127 // Find the lowest stored offset from a non-empty list, remove it, 128 // and reduce all other offsets by this minimum. 129 // Returns [1..maxLength]. 130 int32_t popMinimum() { 131 // Look for the next offset in list[start+1..capacity-1]. 132 int32_t i=start, result; 133 while(++i<capacity) { 134 if(list[i]) { 135 list[i]=false; 136 --length; 137 result=i-start; 138 start=i; 139 return result; 140 } 141 } 142 // i==capacity 143 144 // Wrap around and look for the next offset in list[0..start]. 145 // Since the list is not empty, there will be one. 146 result=capacity-start; 147 i=0; 148 while(!list[i]) { 149 ++i; 150 } 151 list[i]=false; 152 --length; 153 start=i; 154 return result+=i; 155 } 156 157 private: 158 UBool *list; 159 int32_t capacity; 160 int32_t length; 161 int32_t start; 162 163 UBool staticList[16]; 164 }; 165 166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string. 167 static int32_t 168 getUTF8Length(const char16_t *s, int32_t length) { 169 UErrorCode errorCode=U_ZERO_ERROR; 170 int32_t length8=0; 171 u_strToUTF8(nullptr, 0, &length8, s, length, &errorCode); 172 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 173 return length8; 174 } else { 175 // The string contains an unpaired surrogate. 176 // Ignore this string. 177 return 0; 178 } 179 } 180 181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length. 182 static int32_t 183 appendUTF8(const char16_t *s, int32_t length, uint8_t *t, int32_t capacity) { 184 UErrorCode errorCode=U_ZERO_ERROR; 185 int32_t length8=0; 186 u_strToUTF8(reinterpret_cast<char*>(t), capacity, &length8, s, length, &errorCode); 187 if(U_SUCCESS(errorCode)) { 188 return length8; 189 } else { 190 // The string contains an unpaired surrogate. 191 // Ignore this string. 192 return 0; 193 } 194 } 195 196 static inline uint8_t 197 makeSpanLengthByte(int32_t spanLength) { 198 // 0xfe==UnicodeSetStringSpan::LONG_SPAN 199 return spanLength < 0xfe ? static_cast<uint8_t>(spanLength) : static_cast<uint8_t>(0xfe); 200 } 201 202 // Construct for all variants of span(), or only for any one variant. 203 // Initialize as little as possible, for single use. 204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 205 const UVector &setStrings, 206 uint32_t which) 207 : spanSet(0, 0x10ffff), pSpanNotSet(nullptr), strings(setStrings), 208 utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr), 209 utf8Length(0), 210 maxLength16(0), maxLength8(0), 211 all(static_cast<UBool>(which == ALL)) { 212 spanSet.retainAll(set); 213 if(which&NOT_CONTAINED) { 214 // Default to the same sets. 215 // addToSpanNotSet() will create a separate set if necessary. 216 pSpanNotSet=&spanSet; 217 } 218 219 // Determine if the strings even need to be taken into account at all for span() etc. 220 // If any string is relevant, then all strings need to be used for 221 // span(longest match) but only the relevant ones for span(while contained). 222 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 223 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 224 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 225 // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 226 int32_t stringsLength=strings.size(); 227 228 int32_t i, spanLength; 229 UBool someRelevant=false; 230 for(i=0; i<stringsLength; ++i) { 231 const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); 232 const char16_t *s16=string.getBuffer(); 233 int32_t length16=string.length(); 234 if (length16==0) { 235 continue; // skip the empty string 236 } 237 UBool thisRelevant; 238 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 239 if(spanLength<length16) { // Relevant string. 240 someRelevant=thisRelevant=true; 241 } else { 242 thisRelevant=false; 243 } 244 if((which&UTF16) && length16>maxLength16) { 245 maxLength16=length16; 246 } 247 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 248 int32_t length8=getUTF8Length(s16, length16); 249 utf8Length+=length8; 250 if(length8>maxLength8) { 251 maxLength8=length8; 252 } 253 } 254 } 255 if(!someRelevant) { 256 maxLength16=maxLength8=0; 257 return; 258 } 259 260 // Freeze after checking for the need to use strings at all because freezing 261 // a set takes some time and memory which are wasted if there are no relevant strings. 262 if(all) { 263 spanSet.freeze(); 264 } 265 266 uint8_t *spanBackLengths; 267 uint8_t *spanUTF8Lengths; 268 uint8_t *spanBackUTF8Lengths; 269 270 // Allocate a block of meta data. 271 int32_t allocSize; 272 if(all) { 273 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 274 allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 275 } else { 276 allocSize=stringsLength; // One set of span lengths. 277 if(which&UTF8) { 278 // UTF-8 lengths and UTF-8 strings. 279 allocSize+=stringsLength*4+utf8Length; 280 } 281 } 282 if (allocSize <= static_cast<int32_t>(sizeof(staticLengths))) { 283 utf8Lengths=staticLengths; 284 } else { 285 utf8Lengths = static_cast<int32_t*>(uprv_malloc(allocSize)); 286 if(utf8Lengths==nullptr) { 287 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return false. 288 return; // Out of memory. 289 } 290 } 291 292 if(all) { 293 // Store span lengths for all span() variants. 294 spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths + stringsLength); 295 spanBackLengths=spanLengths+stringsLength; 296 spanUTF8Lengths=spanBackLengths+stringsLength; 297 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 298 utf8=spanBackUTF8Lengths+stringsLength; 299 } else { 300 // Store span lengths for only one span() variant. 301 if(which&UTF8) { 302 spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths + stringsLength); 303 utf8=spanLengths+stringsLength; 304 } else { 305 spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths); 306 } 307 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 308 } 309 310 // Set the meta data and pSpanNotSet and write the UTF-8 strings. 311 int32_t utf8Count=0; // Count UTF-8 bytes written so far. 312 313 for(i=0; i<stringsLength; ++i) { 314 const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); 315 const char16_t *s16=string.getBuffer(); 316 int32_t length16=string.length(); 317 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 318 if(spanLength<length16 && length16>0) { // Relevant string. 319 if(which&UTF16) { 320 if(which&CONTAINED) { 321 if(which&FWD) { 322 spanLengths[i]=makeSpanLengthByte(spanLength); 323 } 324 if(which&BACK) { 325 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 326 spanBackLengths[i]=makeSpanLengthByte(spanLength); 327 } 328 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 329 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. 330 } 331 } 332 if(which&UTF8) { 333 uint8_t *s8=utf8+utf8Count; 334 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 335 utf8Count+=utf8Lengths[i]=length8; 336 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. 337 spanUTF8Lengths[i] = spanBackUTF8Lengths[i] = static_cast<uint8_t>(ALL_CP_CONTAINED); 338 } else { // Relevant for UTF-8. 339 if(which&CONTAINED) { 340 if(which&FWD) { 341 spanLength = spanSet.spanUTF8(reinterpret_cast<const char*>(s8), length8, USET_SPAN_CONTAINED); 342 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 343 } 344 if(which&BACK) { 345 spanLength = length8 - spanSet.spanBackUTF8(reinterpret_cast<const char*>(s8), length8, USET_SPAN_CONTAINED); 346 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 347 } 348 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 349 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. 350 } 351 } 352 } 353 if(which&NOT_CONTAINED) { 354 // Add string start and end code points to the spanNotSet so that 355 // a span(while not contained) stops before any string. 356 UChar32 c; 357 if(which&FWD) { 358 int32_t len=0; 359 U16_NEXT(s16, len, length16, c); 360 addToSpanNotSet(c); 361 } 362 if(which&BACK) { 363 int32_t len=length16; 364 U16_PREV(s16, 0, len, c); 365 addToSpanNotSet(c); 366 } 367 } 368 } else { // Irrelevant string. (Also the empty string.) 369 if(which&UTF8) { 370 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. 371 uint8_t *s8=utf8+utf8Count; 372 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 373 utf8Count+=utf8Lengths[i]=length8; 374 } else { 375 utf8Lengths[i]=0; 376 } 377 } 378 if(all) { 379 spanLengths[i]=spanBackLengths[i]= 380 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 381 static_cast<uint8_t>(ALL_CP_CONTAINED); 382 } else { 383 // All spanXYZLengths pointers contain the same address. 384 spanLengths[i] = static_cast<uint8_t>(ALL_CP_CONTAINED); 385 } 386 } 387 } 388 389 // Finish. 390 if(all) { 391 pSpanNotSet->freeze(); 392 } 393 } 394 395 // Copy constructor. Assumes which==ALL for a frozen set. 396 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 397 const UVector &newParentSetStrings) 398 : spanSet(otherStringSpan.spanSet), pSpanNotSet(nullptr), strings(newParentSetStrings), 399 utf8Lengths(nullptr), spanLengths(nullptr), utf8(nullptr), 400 utf8Length(otherStringSpan.utf8Length), 401 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 402 all(true) { 403 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 404 pSpanNotSet=&spanSet; 405 } else { 406 pSpanNotSet=otherStringSpan.pSpanNotSet->clone(); 407 } 408 409 // Allocate a block of meta data. 410 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 411 int32_t stringsLength=strings.size(); 412 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 413 if (allocSize <= static_cast<int32_t>(sizeof(staticLengths))) { 414 utf8Lengths=staticLengths; 415 } else { 416 utf8Lengths = static_cast<int32_t*>(uprv_malloc(allocSize)); 417 if(utf8Lengths==nullptr) { 418 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return false. 419 return; // Out of memory. 420 } 421 } 422 423 spanLengths = reinterpret_cast<uint8_t*>(utf8Lengths + stringsLength); 424 utf8=spanLengths+stringsLength*4; 425 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 426 } 427 428 UnicodeSetStringSpan::~UnicodeSetStringSpan() { 429 if(pSpanNotSet!=nullptr && pSpanNotSet!=&spanSet) { 430 delete pSpanNotSet; 431 } 432 if(utf8Lengths!=nullptr && utf8Lengths!=staticLengths) { 433 uprv_free(utf8Lengths); 434 } 435 } 436 437 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 438 if(pSpanNotSet==nullptr || pSpanNotSet==&spanSet) { 439 if(spanSet.contains(c)) { 440 return; // Nothing to do. 441 } 442 UnicodeSet *newSet=spanSet.cloneAsThawed(); 443 if(newSet==nullptr) { 444 return; // Out of memory. 445 } else { 446 pSpanNotSet=newSet; 447 } 448 } 449 pSpanNotSet->add(c); 450 } 451 452 // Compare strings without any argument checks. Requires length>0. 453 static inline UBool 454 matches16(const char16_t *s, const char16_t *t, int32_t length) { 455 do { 456 if(*s++!=*t++) { 457 return false; 458 } 459 } while(--length>0); 460 return true; 461 } 462 463 static inline UBool 464 matches8(const uint8_t *s, const uint8_t *t, int32_t length) { 465 do { 466 if(*s++!=*t++) { 467 return false; 468 } 469 } while(--length>0); 470 return true; 471 } 472 473 // Compare 16-bit Unicode strings (which may be malformed UTF-16) 474 // at code point boundaries. 475 // That is, each edge of a match must not be in the middle of a surrogate pair. 476 static inline UBool 477 matches16CPB(const char16_t *s, int32_t start, int32_t limit, const char16_t *t, int32_t length) { 478 s+=start; 479 limit-=start; 480 return matches16(s, t, length) && 481 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 482 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 483 } 484 485 // Does the set contain the next code point? 486 // If so, return its length; otherwise return its negative length. 487 static inline int32_t 488 spanOne(const UnicodeSet &set, const char16_t *s, int32_t length) { 489 char16_t c=*s, c2; 490 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 491 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 492 } 493 return set.contains(c) ? 1 : -1; 494 } 495 496 static inline int32_t 497 spanOneBack(const UnicodeSet &set, const char16_t *s, int32_t length) { 498 char16_t c=s[length-1], c2; 499 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 500 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 501 } 502 return set.contains(c) ? 1 : -1; 503 } 504 505 static inline int32_t 506 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 507 UChar32 c=*s; 508 if(U8_IS_SINGLE(c)) { 509 return set.contains(c) ? 1 : -1; 510 } 511 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). 512 int32_t i=0; 513 U8_NEXT_OR_FFFD(s, i, length, c); 514 return set.contains(c) ? i : -i; 515 } 516 517 static inline int32_t 518 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 519 UChar32 c=s[length-1]; 520 if(U8_IS_SINGLE(c)) { 521 return set.contains(c) ? 1 : -1; 522 } 523 int32_t i=length-1; 524 c=utf8_prevCharSafeBody(s, 0, &i, c, -3); 525 length-=i; 526 return set.contains(c) ? length : -length; 527 } 528 529 /* 530 * Note: In span() when spanLength==0 (after a string match, or at the beginning 531 * after an empty code point span) and in spanNot() and spanNotUTF8(), 532 * string matching could use a binary search 533 * because all string matches are done from the same start index. 534 * 535 * For UTF-8, this would require a comparison function that returns UTF-16 order. 536 * 537 * This optimization should not be necessary for normal UnicodeSets because 538 * most sets have no strings, and most sets with strings have 539 * very few very short strings. 540 * For cases with many strings, it might be better to use a different API 541 * and implementation with a DFA (state machine). 542 */ 543 544 /* 545 * Algorithm for span(USET_SPAN_CONTAINED) 546 * 547 * Theoretical algorithm: 548 * - Iterate through the string, and at each code point boundary: 549 * + If the code point there is in the set, then remember to continue after it. 550 * + If a set string matches at the current position, then remember to continue after it. 551 * + Either recursively span for each code point or string match, 552 * or recursively span for all but the shortest one and 553 * iteratively continue the span with the shortest local match. 554 * + Remember the longest recursive span (the farthest end point). 555 * + If there is no match at the current position, neither for the code point there 556 * nor for any set string, then stop and return the longest recursive span length. 557 * 558 * Optimized implementation: 559 * 560 * (We assume that most sets will have very few very short strings. 561 * A span using a string-less set is extremely fast.) 562 * 563 * Create and cache a spanSet which contains all of the single code points 564 * of the original set but none of its strings. 565 * 566 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 567 * - Loop: 568 * + Try to match each set string at the end of the spanLength. 569 * ~ Set strings that start with set-contained code points must be matched 570 * with a partial overlap because the recursive algorithm would have tried 571 * to match them at every position. 572 * ~ Set strings that entirely consist of set-contained code points 573 * are irrelevant for span(USET_SPAN_CONTAINED) because the 574 * recursive algorithm would continue after them anyway 575 * and find the longest recursive match from their end. 576 * ~ Rather than recursing, note each end point of a set string match. 577 * + If no set string matched after spanSet.span(), then return 578 * with where the spanSet.span() ended. 579 * + If at least one set string matched after spanSet.span(), then 580 * pop the shortest string match end point and continue 581 * the loop, trying to match all set strings from there. 582 * + If at least one more set string matched after a previous string match, 583 * then test if the code point after the previous string match is also 584 * contained in the set. 585 * Continue the loop with the shortest end point of either this code point 586 * or a matching set string. 587 * + If no more set string matched after a previous string match, 588 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 589 * Stop if spanLength==0, otherwise continue the loop. 590 * 591 * By noting each end point of a set string match, 592 * the function visits each string position at most once and finishes 593 * in linear time. 594 * 595 * The recursive algorithm may visit the same string position many times 596 * if multiple paths lead to it and finishes in exponential time. 597 */ 598 599 /* 600 * Algorithm for span(USET_SPAN_SIMPLE) 601 * 602 * Theoretical algorithm: 603 * - Iterate through the string, and at each code point boundary: 604 * + If the code point there is in the set, then remember to continue after it. 605 * + If a set string matches at the current position, then remember to continue after it. 606 * + Continue from the farthest match position and ignore all others. 607 * + If there is no match at the current position, 608 * then stop and return the current position. 609 * 610 * Optimized implementation: 611 * 612 * (Same assumption and spanSet as above.) 613 * 614 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 615 * - Loop: 616 * + Try to match each set string at the end of the spanLength. 617 * ~ Set strings that start with set-contained code points must be matched 618 * with a partial overlap because the standard algorithm would have tried 619 * to match them earlier. 620 * ~ Set strings that entirely consist of set-contained code points 621 * must be matched with a full overlap because the longest-match algorithm 622 * would hide set string matches that end earlier. 623 * Such set strings need not be matched earlier inside the code point span 624 * because the standard algorithm would then have continued after 625 * the set string match anyway. 626 * ~ Remember the longest set string match (farthest end point) from the earliest 627 * starting point. 628 * + If no set string matched after spanSet.span(), then return 629 * with where the spanSet.span() ended. 630 * + If at least one set string matched, then continue the loop after the 631 * longest match from the earliest position. 632 * + If no more set string matched after a previous string match, 633 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 634 * Stop if spanLength==0, otherwise continue the loop. 635 */ 636 637 int32_t UnicodeSetStringSpan::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const { 638 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 639 return spanNot(s, length); 640 } 641 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 642 if(spanLength==length) { 643 return length; 644 } 645 646 // Consider strings; they may overlap with the span. 647 OffsetList offsets; 648 if(spanCondition==USET_SPAN_CONTAINED) { 649 // Use offset list to try all possibilities. 650 offsets.setMaxLength(maxLength16); 651 } 652 int32_t pos=spanLength, rest=length-pos; 653 int32_t i, stringsLength=strings.size(); 654 for(;;) { 655 if(spanCondition==USET_SPAN_CONTAINED) { 656 for(i=0; i<stringsLength; ++i) { 657 int32_t overlap=spanLengths[i]; 658 if(overlap==ALL_CP_CONTAINED) { 659 continue; // Irrelevant string. (Also the empty string.) 660 } 661 const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); 662 const char16_t *s16=string.getBuffer(); 663 int32_t length16=string.length(); 664 U_ASSERT(length>0); 665 666 // Try to match this string at pos-overlap..pos. 667 if(overlap>=LONG_SPAN) { 668 overlap=length16; 669 // While contained: No point matching fully inside the code point span. 670 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. 671 } 672 if(overlap>spanLength) { 673 overlap=spanLength; 674 } 675 int32_t inc=length16-overlap; // Keep overlap+inc==length16. 676 for(;;) { 677 if(inc>rest) { 678 break; 679 } 680 // Try to match if the increment is not listed already. 681 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 682 if(inc==rest) { 683 return length; // Reached the end of the string. 684 } 685 offsets.addOffset(inc); 686 } 687 if(overlap==0) { 688 break; 689 } 690 --overlap; 691 ++inc; 692 } 693 } 694 } else /* USET_SPAN_SIMPLE */ { 695 int32_t maxInc=0, maxOverlap=0; 696 for(i=0; i<stringsLength; ++i) { 697 int32_t overlap=spanLengths[i]; 698 // For longest match, we do need to try to match even an all-contained string 699 // to find the match from the earliest start. 700 701 const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); 702 const char16_t *s16=string.getBuffer(); 703 int32_t length16=string.length(); 704 if (length16==0) { 705 continue; // skip the empty string 706 } 707 708 // Try to match this string at pos-overlap..pos. 709 if(overlap>=LONG_SPAN) { 710 overlap=length16; 711 // Longest match: Need to match fully inside the code point span 712 // to find the match from the earliest start. 713 } 714 if(overlap>spanLength) { 715 overlap=spanLength; 716 } 717 int32_t inc=length16-overlap; // Keep overlap+inc==length16. 718 for(;;) { 719 if(inc>rest || overlap<maxOverlap) { 720 break; 721 } 722 // Try to match if the string is longer or starts earlier. 723 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 724 matches16CPB(s, pos-overlap, length, s16, length16) 725 ) { 726 maxInc=inc; // Longest match from earliest start. 727 maxOverlap=overlap; 728 break; 729 } 730 --overlap; 731 ++inc; 732 } 733 } 734 735 if(maxInc!=0 || maxOverlap!=0) { 736 // Longest-match algorithm, and there was a string match. 737 // Simply continue after it. 738 pos+=maxInc; 739 rest-=maxInc; 740 if(rest==0) { 741 return length; // Reached the end of the string. 742 } 743 spanLength=0; // Match strings from after a string match. 744 continue; 745 } 746 } 747 // Finished trying to match all strings at pos. 748 749 if(spanLength!=0 || pos==0) { 750 // The position is after an unlimited code point span (spanLength!=0), 751 // not after a string match. 752 // The only position where spanLength==0 after a span is pos==0. 753 // Otherwise, an unlimited code point span is only tried again when no 754 // strings match, and if such a non-initial span fails we stop. 755 if(offsets.isEmpty()) { 756 return pos; // No strings matched after a span. 757 } 758 // Match strings from after the next string match. 759 } else { 760 // The position is after a string match (or a single code point). 761 if(offsets.isEmpty()) { 762 // No more strings matched after a previous string match. 763 // Try another code point span from after the last string match. 764 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 765 if( spanLength==rest || // Reached the end of the string, or 766 spanLength==0 // neither strings nor span progressed. 767 ) { 768 return pos+spanLength; 769 } 770 pos+=spanLength; 771 rest-=spanLength; 772 continue; // spanLength>0: Match strings from after a span. 773 } else { 774 // Try to match only one code point from after a string match if some 775 // string matched beyond it, so that we try all possible positions 776 // and don't overshoot. 777 spanLength=spanOne(spanSet, s+pos, rest); 778 if(spanLength>0) { 779 if(spanLength==rest) { 780 return length; // Reached the end of the string. 781 } 782 // Match strings after this code point. 783 // There cannot be any increments below it because UnicodeSet strings 784 // contain multiple code points. 785 pos+=spanLength; 786 rest-=spanLength; 787 offsets.shift(spanLength); 788 spanLength=0; 789 continue; // Match strings from after a single code point. 790 } 791 // Match strings from after the next string match. 792 } 793 } 794 int32_t minOffset=offsets.popMinimum(); 795 pos+=minOffset; 796 rest-=minOffset; 797 spanLength=0; // Match strings from after a string match. 798 } 799 } 800 801 int32_t UnicodeSetStringSpan::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const { 802 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 803 return spanNotBack(s, length); 804 } 805 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 806 if(pos==0) { 807 return 0; 808 } 809 int32_t spanLength=length-pos; 810 811 // Consider strings; they may overlap with the span. 812 OffsetList offsets; 813 if(spanCondition==USET_SPAN_CONTAINED) { 814 // Use offset list to try all possibilities. 815 offsets.setMaxLength(maxLength16); 816 } 817 int32_t i, stringsLength=strings.size(); 818 uint8_t *spanBackLengths=spanLengths; 819 if(all) { 820 spanBackLengths+=stringsLength; 821 } 822 for(;;) { 823 if(spanCondition==USET_SPAN_CONTAINED) { 824 for(i=0; i<stringsLength; ++i) { 825 int32_t overlap=spanBackLengths[i]; 826 if(overlap==ALL_CP_CONTAINED) { 827 continue; // Irrelevant string. (Also the empty string.) 828 } 829 const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); 830 const char16_t *s16=string.getBuffer(); 831 int32_t length16=string.length(); 832 U_ASSERT(length>0); 833 834 // Try to match this string at pos-(length16-overlap)..pos-length16. 835 if(overlap>=LONG_SPAN) { 836 overlap=length16; 837 // While contained: No point matching fully inside the code point span. 838 int32_t len1=0; 839 U16_FWD_1(s16, len1, overlap); 840 overlap-=len1; // Length of the string minus the first code point. 841 } 842 if(overlap>spanLength) { 843 overlap=spanLength; 844 } 845 int32_t dec=length16-overlap; // Keep dec+overlap==length16. 846 for(;;) { 847 if(dec>pos) { 848 break; 849 } 850 // Try to match if the decrement is not listed already. 851 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 852 if(dec==pos) { 853 return 0; // Reached the start of the string. 854 } 855 offsets.addOffset(dec); 856 } 857 if(overlap==0) { 858 break; 859 } 860 --overlap; 861 ++dec; 862 } 863 } 864 } else /* USET_SPAN_SIMPLE */ { 865 int32_t maxDec=0, maxOverlap=0; 866 for(i=0; i<stringsLength; ++i) { 867 int32_t overlap=spanBackLengths[i]; 868 // For longest match, we do need to try to match even an all-contained string 869 // to find the match from the latest end. 870 871 const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); 872 const char16_t *s16=string.getBuffer(); 873 int32_t length16=string.length(); 874 if (length16==0) { 875 continue; // skip the empty string 876 } 877 878 // Try to match this string at pos-(length16-overlap)..pos-length16. 879 if(overlap>=LONG_SPAN) { 880 overlap=length16; 881 // Longest match: Need to match fully inside the code point span 882 // to find the match from the latest end. 883 } 884 if(overlap>spanLength) { 885 overlap=spanLength; 886 } 887 int32_t dec=length16-overlap; // Keep dec+overlap==length16. 888 for(;;) { 889 if(dec>pos || overlap<maxOverlap) { 890 break; 891 } 892 // Try to match if the string is longer or ends later. 893 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 894 matches16CPB(s, pos-dec, length, s16, length16) 895 ) { 896 maxDec=dec; // Longest match from latest end. 897 maxOverlap=overlap; 898 break; 899 } 900 --overlap; 901 ++dec; 902 } 903 } 904 905 if(maxDec!=0 || maxOverlap!=0) { 906 // Longest-match algorithm, and there was a string match. 907 // Simply continue before it. 908 pos-=maxDec; 909 if(pos==0) { 910 return 0; // Reached the start of the string. 911 } 912 spanLength=0; // Match strings from before a string match. 913 continue; 914 } 915 } 916 // Finished trying to match all strings at pos. 917 918 if(spanLength!=0 || pos==length) { 919 // The position is before an unlimited code point span (spanLength!=0), 920 // not before a string match. 921 // The only position where spanLength==0 before a span is pos==length. 922 // Otherwise, an unlimited code point span is only tried again when no 923 // strings match, and if such a non-initial span fails we stop. 924 if(offsets.isEmpty()) { 925 return pos; // No strings matched before a span. 926 } 927 // Match strings from before the next string match. 928 } else { 929 // The position is before a string match (or a single code point). 930 if(offsets.isEmpty()) { 931 // No more strings matched before a previous string match. 932 // Try another code point span from before the last string match. 933 int32_t oldPos=pos; 934 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 935 spanLength=oldPos-pos; 936 if( pos==0 || // Reached the start of the string, or 937 spanLength==0 // neither strings nor span progressed. 938 ) { 939 return pos; 940 } 941 continue; // spanLength>0: Match strings from before a span. 942 } else { 943 // Try to match only one code point from before a string match if some 944 // string matched beyond it, so that we try all possible positions 945 // and don't overshoot. 946 spanLength=spanOneBack(spanSet, s, pos); 947 if(spanLength>0) { 948 if(spanLength==pos) { 949 return 0; // Reached the start of the string. 950 } 951 // Match strings before this code point. 952 // There cannot be any decrements below it because UnicodeSet strings 953 // contain multiple code points. 954 pos-=spanLength; 955 offsets.shift(spanLength); 956 spanLength=0; 957 continue; // Match strings from before a single code point. 958 } 959 // Match strings from before the next string match. 960 } 961 } 962 pos-=offsets.popMinimum(); 963 spanLength=0; // Match strings from before a string match. 964 } 965 } 966 967 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 968 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 969 return spanNotUTF8(s, length); 970 } 971 int32_t spanLength = spanSet.spanUTF8(reinterpret_cast<const char*>(s), length, USET_SPAN_CONTAINED); 972 if(spanLength==length) { 973 return length; 974 } 975 976 // Consider strings; they may overlap with the span. 977 OffsetList offsets; 978 if(spanCondition==USET_SPAN_CONTAINED) { 979 // Use offset list to try all possibilities. 980 offsets.setMaxLength(maxLength8); 981 } 982 int32_t pos=spanLength, rest=length-pos; 983 int32_t i, stringsLength=strings.size(); 984 uint8_t *spanUTF8Lengths=spanLengths; 985 if(all) { 986 spanUTF8Lengths+=2*stringsLength; 987 } 988 for(;;) { 989 const uint8_t *s8=utf8; 990 int32_t length8; 991 if(spanCondition==USET_SPAN_CONTAINED) { 992 for(i=0; i<stringsLength; ++i) { 993 length8=utf8Lengths[i]; 994 if(length8==0) { 995 continue; // String not representable in UTF-8. 996 } 997 int32_t overlap=spanUTF8Lengths[i]; 998 if(overlap==ALL_CP_CONTAINED) { 999 s8+=length8; 1000 continue; // Irrelevant string. 1001 } 1002 1003 // Try to match this string at pos-overlap..pos. 1004 if(overlap>=LONG_SPAN) { 1005 overlap=length8; 1006 // While contained: No point matching fully inside the code point span. 1007 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. 1008 } 1009 if(overlap>spanLength) { 1010 overlap=spanLength; 1011 } 1012 int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1013 for(;;) { 1014 if(inc>rest) { 1015 break; 1016 } 1017 // Try to match if the increment is not listed already. 1018 // Match at code point boundaries. (The UTF-8 strings were converted 1019 // from UTF-16 and are guaranteed to be well-formed.) 1020 if(!U8_IS_TRAIL(s[pos-overlap]) && 1021 !offsets.containsOffset(inc) && 1022 matches8(s+pos-overlap, s8, length8)) { 1023 if(inc==rest) { 1024 return length; // Reached the end of the string. 1025 } 1026 offsets.addOffset(inc); 1027 } 1028 if(overlap==0) { 1029 break; 1030 } 1031 --overlap; 1032 ++inc; 1033 } 1034 s8+=length8; 1035 } 1036 } else /* USET_SPAN_SIMPLE */ { 1037 int32_t maxInc=0, maxOverlap=0; 1038 for(i=0; i<stringsLength; ++i) { 1039 length8=utf8Lengths[i]; 1040 if(length8==0) { 1041 continue; // String not representable in UTF-8. 1042 } 1043 int32_t overlap=spanUTF8Lengths[i]; 1044 // For longest match, we do need to try to match even an all-contained string 1045 // to find the match from the earliest start. 1046 1047 // Try to match this string at pos-overlap..pos. 1048 if(overlap>=LONG_SPAN) { 1049 overlap=length8; 1050 // Longest match: Need to match fully inside the code point span 1051 // to find the match from the earliest start. 1052 } 1053 if(overlap>spanLength) { 1054 overlap=spanLength; 1055 } 1056 int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1057 for(;;) { 1058 if(inc>rest || overlap<maxOverlap) { 1059 break; 1060 } 1061 // Try to match if the string is longer or starts earlier. 1062 // Match at code point boundaries. (The UTF-8 strings were converted 1063 // from UTF-16 and are guaranteed to be well-formed.) 1064 if(!U8_IS_TRAIL(s[pos-overlap]) && 1065 (overlap>maxOverlap || 1066 /* redundant overlap==maxOverlap && */ inc>maxInc) && 1067 matches8(s+pos-overlap, s8, length8)) { 1068 maxInc=inc; // Longest match from earliest start. 1069 maxOverlap=overlap; 1070 break; 1071 } 1072 --overlap; 1073 ++inc; 1074 } 1075 s8+=length8; 1076 } 1077 1078 if(maxInc!=0 || maxOverlap!=0) { 1079 // Longest-match algorithm, and there was a string match. 1080 // Simply continue after it. 1081 pos+=maxInc; 1082 rest-=maxInc; 1083 if(rest==0) { 1084 return length; // Reached the end of the string. 1085 } 1086 spanLength=0; // Match strings from after a string match. 1087 continue; 1088 } 1089 } 1090 // Finished trying to match all strings at pos. 1091 1092 if(spanLength!=0 || pos==0) { 1093 // The position is after an unlimited code point span (spanLength!=0), 1094 // not after a string match. 1095 // The only position where spanLength==0 after a span is pos==0. 1096 // Otherwise, an unlimited code point span is only tried again when no 1097 // strings match, and if such a non-initial span fails we stop. 1098 if(offsets.isEmpty()) { 1099 return pos; // No strings matched after a span. 1100 } 1101 // Match strings from after the next string match. 1102 } else { 1103 // The position is after a string match (or a single code point). 1104 if(offsets.isEmpty()) { 1105 // No more strings matched after a previous string match. 1106 // Try another code point span from after the last string match. 1107 spanLength = spanSet.spanUTF8(reinterpret_cast<const char*>(s) + pos, rest, USET_SPAN_CONTAINED); 1108 if( spanLength==rest || // Reached the end of the string, or 1109 spanLength==0 // neither strings nor span progressed. 1110 ) { 1111 return pos+spanLength; 1112 } 1113 pos+=spanLength; 1114 rest-=spanLength; 1115 continue; // spanLength>0: Match strings from after a span. 1116 } else { 1117 // Try to match only one code point from after a string match if some 1118 // string matched beyond it, so that we try all possible positions 1119 // and don't overshoot. 1120 spanLength=spanOneUTF8(spanSet, s+pos, rest); 1121 if(spanLength>0) { 1122 if(spanLength==rest) { 1123 return length; // Reached the end of the string. 1124 } 1125 // Match strings after this code point. 1126 // There cannot be any increments below it because UnicodeSet strings 1127 // contain multiple code points. 1128 pos+=spanLength; 1129 rest-=spanLength; 1130 offsets.shift(spanLength); 1131 spanLength=0; 1132 continue; // Match strings from after a single code point. 1133 } 1134 // Match strings from after the next string match. 1135 } 1136 } 1137 int32_t minOffset=offsets.popMinimum(); 1138 pos+=minOffset; 1139 rest-=minOffset; 1140 spanLength=0; // Match strings from after a string match. 1141 } 1142 } 1143 1144 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1145 if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1146 return spanNotBackUTF8(s, length); 1147 } 1148 int32_t pos = spanSet.spanBackUTF8(reinterpret_cast<const char*>(s), length, USET_SPAN_CONTAINED); 1149 if(pos==0) { 1150 return 0; 1151 } 1152 int32_t spanLength=length-pos; 1153 1154 // Consider strings; they may overlap with the span. 1155 OffsetList offsets; 1156 if(spanCondition==USET_SPAN_CONTAINED) { 1157 // Use offset list to try all possibilities. 1158 offsets.setMaxLength(maxLength8); 1159 } 1160 int32_t i, stringsLength=strings.size(); 1161 uint8_t *spanBackUTF8Lengths=spanLengths; 1162 if(all) { 1163 spanBackUTF8Lengths+=3*stringsLength; 1164 } 1165 for(;;) { 1166 const uint8_t *s8=utf8; 1167 int32_t length8; 1168 if(spanCondition==USET_SPAN_CONTAINED) { 1169 for(i=0; i<stringsLength; ++i) { 1170 length8=utf8Lengths[i]; 1171 if(length8==0) { 1172 continue; // String not representable in UTF-8. 1173 } 1174 int32_t overlap=spanBackUTF8Lengths[i]; 1175 if(overlap==ALL_CP_CONTAINED) { 1176 s8+=length8; 1177 continue; // Irrelevant string. 1178 } 1179 1180 // Try to match this string at pos-(length8-overlap)..pos-length8. 1181 if(overlap>=LONG_SPAN) { 1182 overlap=length8; 1183 // While contained: No point matching fully inside the code point span. 1184 int32_t len1=0; 1185 U8_FWD_1(s8, len1, overlap); 1186 overlap-=len1; // Length of the string minus the first code point. 1187 } 1188 if(overlap>spanLength) { 1189 overlap=spanLength; 1190 } 1191 int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1192 for(;;) { 1193 if(dec>pos) { 1194 break; 1195 } 1196 // Try to match if the decrement is not listed already. 1197 // Match at code point boundaries. (The UTF-8 strings were converted 1198 // from UTF-16 and are guaranteed to be well-formed.) 1199 if( !U8_IS_TRAIL(s[pos-dec]) && 1200 !offsets.containsOffset(dec) && 1201 matches8(s+pos-dec, s8, length8) 1202 ) { 1203 if(dec==pos) { 1204 return 0; // Reached the start of the string. 1205 } 1206 offsets.addOffset(dec); 1207 } 1208 if(overlap==0) { 1209 break; 1210 } 1211 --overlap; 1212 ++dec; 1213 } 1214 s8+=length8; 1215 } 1216 } else /* USET_SPAN_SIMPLE */ { 1217 int32_t maxDec=0, maxOverlap=0; 1218 for(i=0; i<stringsLength; ++i) { 1219 length8=utf8Lengths[i]; 1220 if(length8==0) { 1221 continue; // String not representable in UTF-8. 1222 } 1223 int32_t overlap=spanBackUTF8Lengths[i]; 1224 // For longest match, we do need to try to match even an all-contained string 1225 // to find the match from the latest end. 1226 1227 // Try to match this string at pos-(length8-overlap)..pos-length8. 1228 if(overlap>=LONG_SPAN) { 1229 overlap=length8; 1230 // Longest match: Need to match fully inside the code point span 1231 // to find the match from the latest end. 1232 } 1233 if(overlap>spanLength) { 1234 overlap=spanLength; 1235 } 1236 int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1237 for(;;) { 1238 if(dec>pos || overlap<maxOverlap) { 1239 break; 1240 } 1241 // Try to match if the string is longer or ends later. 1242 // Match at code point boundaries. (The UTF-8 strings were converted 1243 // from UTF-16 and are guaranteed to be well-formed.) 1244 if( !U8_IS_TRAIL(s[pos-dec]) && 1245 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 1246 matches8(s+pos-dec, s8, length8) 1247 ) { 1248 maxDec=dec; // Longest match from latest end. 1249 maxOverlap=overlap; 1250 break; 1251 } 1252 --overlap; 1253 ++dec; 1254 } 1255 s8+=length8; 1256 } 1257 1258 if(maxDec!=0 || maxOverlap!=0) { 1259 // Longest-match algorithm, and there was a string match. 1260 // Simply continue before it. 1261 pos-=maxDec; 1262 if(pos==0) { 1263 return 0; // Reached the start of the string. 1264 } 1265 spanLength=0; // Match strings from before a string match. 1266 continue; 1267 } 1268 } 1269 // Finished trying to match all strings at pos. 1270 1271 if(spanLength!=0 || pos==length) { 1272 // The position is before an unlimited code point span (spanLength!=0), 1273 // not before a string match. 1274 // The only position where spanLength==0 before a span is pos==length. 1275 // Otherwise, an unlimited code point span is only tried again when no 1276 // strings match, and if such a non-initial span fails we stop. 1277 if(offsets.isEmpty()) { 1278 return pos; // No strings matched before a span. 1279 } 1280 // Match strings from before the next string match. 1281 } else { 1282 // The position is before a string match (or a single code point). 1283 if(offsets.isEmpty()) { 1284 // No more strings matched before a previous string match. 1285 // Try another code point span from before the last string match. 1286 int32_t oldPos=pos; 1287 pos = spanSet.spanBackUTF8(reinterpret_cast<const char*>(s), oldPos, USET_SPAN_CONTAINED); 1288 spanLength=oldPos-pos; 1289 if( pos==0 || // Reached the start of the string, or 1290 spanLength==0 // neither strings nor span progressed. 1291 ) { 1292 return pos; 1293 } 1294 continue; // spanLength>0: Match strings from before a span. 1295 } else { 1296 // Try to match only one code point from before a string match if some 1297 // string matched beyond it, so that we try all possible positions 1298 // and don't overshoot. 1299 spanLength=spanOneBackUTF8(spanSet, s, pos); 1300 if(spanLength>0) { 1301 if(spanLength==pos) { 1302 return 0; // Reached the start of the string. 1303 } 1304 // Match strings before this code point. 1305 // There cannot be any decrements below it because UnicodeSet strings 1306 // contain multiple code points. 1307 pos-=spanLength; 1308 offsets.shift(spanLength); 1309 spanLength=0; 1310 continue; // Match strings from before a single code point. 1311 } 1312 // Match strings from before the next string match. 1313 } 1314 } 1315 pos-=offsets.popMinimum(); 1316 spanLength=0; // Match strings from before a string match. 1317 } 1318 } 1319 1320 /* 1321 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 1322 * 1323 * Theoretical algorithm: 1324 * - Iterate through the string, and at each code point boundary: 1325 * + If the code point there is in the set, then return with the current position. 1326 * + If a set string matches at the current position, then return with the current position. 1327 * 1328 * Optimized implementation: 1329 * 1330 * (Same assumption as for span() above.) 1331 * 1332 * Create and cache a spanNotSet which contains all of the single code points 1333 * of the original set but none of its strings. 1334 * For each set string add its initial code point to the spanNotSet. 1335 * (Also add its final code point for spanNotBack().) 1336 * 1337 * - Loop: 1338 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 1339 * + If the current code point is in the original set, then 1340 * return the current position. 1341 * + If any set string matches at the current position, then 1342 * return the current position. 1343 * + If there is no match at the current position, neither for the code point there 1344 * nor for any set string, then skip this code point and continue the loop. 1345 * This happens for set-string-initial code points that were added to spanNotSet 1346 * when there is not actually a match for such a set string. 1347 */ 1348 1349 int32_t UnicodeSetStringSpan::spanNot(const char16_t *s, int32_t length) const { 1350 int32_t pos=0, rest=length; 1351 int32_t i, stringsLength=strings.size(); 1352 do { 1353 // Span until we find a code point from the set, 1354 // or a code point that starts or ends some string. 1355 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 1356 if(i==rest) { 1357 return length; // Reached the end of the string. 1358 } 1359 pos+=i; 1360 rest-=i; 1361 1362 // Check whether the current code point is in the original set, 1363 // without the string starts and ends. 1364 int32_t cpLength=spanOne(spanSet, s+pos, rest); 1365 if(cpLength>0) { 1366 return pos; // There is a set element at pos. 1367 } 1368 1369 // Try to match the strings at pos. 1370 for(i=0; i<stringsLength; ++i) { 1371 if(spanLengths[i]==ALL_CP_CONTAINED) { 1372 continue; // Irrelevant string. (Also the empty string.) 1373 } 1374 const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); 1375 const char16_t *s16=string.getBuffer(); 1376 int32_t length16=string.length(); 1377 U_ASSERT(length>0); 1378 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 1379 return pos; // There is a set element at pos. 1380 } 1381 } 1382 1383 // The span(while not contained) ended on a string start/end which is 1384 // not in the original set. Skip this code point and continue. 1385 // cpLength<0 1386 pos-=cpLength; 1387 rest+=cpLength; 1388 } while(rest!=0); 1389 return length; // Reached the end of the string. 1390 } 1391 1392 int32_t UnicodeSetStringSpan::spanNotBack(const char16_t *s, int32_t length) const { 1393 int32_t pos=length; 1394 int32_t i, stringsLength=strings.size(); 1395 do { 1396 // Span until we find a code point from the set, 1397 // or a code point that starts or ends some string. 1398 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 1399 if(pos==0) { 1400 return 0; // Reached the start of the string. 1401 } 1402 1403 // Check whether the current code point is in the original set, 1404 // without the string starts and ends. 1405 int32_t cpLength=spanOneBack(spanSet, s, pos); 1406 if(cpLength>0) { 1407 return pos; // There is a set element at pos. 1408 } 1409 1410 // Try to match the strings at pos. 1411 for(i=0; i<stringsLength; ++i) { 1412 // Use spanLengths rather than a spanBackLengths pointer because 1413 // it is easier and we only need to know whether the string is irrelevant 1414 // which is the same in either array. 1415 if(spanLengths[i]==ALL_CP_CONTAINED) { 1416 continue; // Irrelevant string. (Also the empty string.) 1417 } 1418 const UnicodeString& string = *static_cast<const UnicodeString*>(strings.elementAt(i)); 1419 const char16_t *s16=string.getBuffer(); 1420 int32_t length16=string.length(); 1421 U_ASSERT(length>0); 1422 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 1423 return pos; // There is a set element at pos. 1424 } 1425 } 1426 1427 // The span(while not contained) ended on a string start/end which is 1428 // not in the original set. Skip this code point and continue. 1429 // cpLength<0 1430 pos+=cpLength; 1431 } while(pos!=0); 1432 return 0; // Reached the start of the string. 1433 } 1434 1435 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 1436 int32_t pos=0, rest=length; 1437 int32_t i, stringsLength=strings.size(); 1438 uint8_t *spanUTF8Lengths=spanLengths; 1439 if(all) { 1440 spanUTF8Lengths+=2*stringsLength; 1441 } 1442 do { 1443 // Span until we find a code point from the set, 1444 // or a code point that starts or ends some string. 1445 i = pSpanNotSet->spanUTF8(reinterpret_cast<const char*>(s) + pos, rest, USET_SPAN_NOT_CONTAINED); 1446 if(i==rest) { 1447 return length; // Reached the end of the string. 1448 } 1449 pos+=i; 1450 rest-=i; 1451 1452 // Check whether the current code point is in the original set, 1453 // without the string starts and ends. 1454 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 1455 if(cpLength>0) { 1456 return pos; // There is a set element at pos. 1457 } 1458 1459 // Try to match the strings at pos. 1460 const uint8_t *s8=utf8; 1461 int32_t length8; 1462 for(i=0; i<stringsLength; ++i) { 1463 length8=utf8Lengths[i]; 1464 // ALL_CP_CONTAINED: Irrelevant string. 1465 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 1466 return pos; // There is a set element at pos. 1467 } 1468 s8+=length8; 1469 } 1470 1471 // The span(while not contained) ended on a string start/end which is 1472 // not in the original set. Skip this code point and continue. 1473 // cpLength<0 1474 pos-=cpLength; 1475 rest+=cpLength; 1476 } while(rest!=0); 1477 return length; // Reached the end of the string. 1478 } 1479 1480 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 1481 int32_t pos=length; 1482 int32_t i, stringsLength=strings.size(); 1483 uint8_t *spanBackUTF8Lengths=spanLengths; 1484 if(all) { 1485 spanBackUTF8Lengths+=3*stringsLength; 1486 } 1487 do { 1488 // Span until we find a code point from the set, 1489 // or a code point that starts or ends some string. 1490 pos = pSpanNotSet->spanBackUTF8(reinterpret_cast<const char*>(s), pos, USET_SPAN_NOT_CONTAINED); 1491 if(pos==0) { 1492 return 0; // Reached the start of the string. 1493 } 1494 1495 // Check whether the current code point is in the original set, 1496 // without the string starts and ends. 1497 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 1498 if(cpLength>0) { 1499 return pos; // There is a set element at pos. 1500 } 1501 1502 // Try to match the strings at pos. 1503 const uint8_t *s8=utf8; 1504 int32_t length8; 1505 for(i=0; i<stringsLength; ++i) { 1506 length8=utf8Lengths[i]; 1507 // ALL_CP_CONTAINED: Irrelevant string. 1508 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 1509 return pos; // There is a set element at pos. 1510 } 1511 s8+=length8; 1512 } 1513 1514 // The span(while not contained) ended on a string start/end which is 1515 // not in the original set. Skip this code point and continue. 1516 // cpLength<0 1517 pos+=cpLength; 1518 } while(pos!=0); 1519 return 0; // Reached the start of the string. 1520 } 1521 1522 U_NAMESPACE_END