bmpset.cpp (25240B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2007-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: bmpset.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2007jan29 16 * created by: Markus W. Scherer 17 */ 18 19 #include "unicode/utypes.h" 20 #include "unicode/uniset.h" 21 #include "unicode/utf8.h" 22 #include "unicode/utf16.h" 23 #include "cmemory.h" 24 #include "bmpset.h" 25 #include "uassert.h" 26 27 U_NAMESPACE_BEGIN 28 29 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : 30 list(parentList), listLength(parentListLength) { 31 uprv_memset(latin1Contains, 0, sizeof(latin1Contains)); 32 uprv_memset(table7FF, 0, sizeof(table7FF)); 33 uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); 34 35 /* 36 * Set the list indexes for binary searches for 37 * U+0800, U+1000, U+2000, .., U+F000, U+10000. 38 * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are 39 * looked up in the bit tables. 40 * The last pair of indexes is for finding supplementary code points. 41 */ 42 list4kStarts[0]=findCodePoint(0x800, 0, listLength-1); 43 int32_t i; 44 for(i=1; i<=0x10; ++i) { 45 list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); 46 } 47 list4kStarts[0x11]=listLength-1; 48 containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]); 49 50 initBits(); 51 overrideIllegal(); 52 } 53 54 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : 55 containsFFFD(otherBMPSet.containsFFFD), 56 list(newParentList), listLength(newParentListLength) { 57 uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains)); 58 uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); 59 uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); 60 uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); 61 } 62 63 BMPSet::~BMPSet() { 64 } 65 66 /* 67 * Set bits in a bit rectangle in "vertical" bit organization. 68 * start<limit<=0x800 69 */ 70 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) { 71 U_ASSERT(start<limit); 72 U_ASSERT(limit<=0x800); 73 74 int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits. 75 int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. 76 77 // Set one bit indicating an all-one block. 78 uint32_t bits = static_cast<uint32_t>(1) << lead; 79 if((start+1)==limit) { // Single-character shortcut. 80 table[trail]|=bits; 81 return; 82 } 83 84 int32_t limitLead=limit>>6; 85 int32_t limitTrail=limit&0x3f; 86 87 if(lead==limitLead) { 88 // Partial vertical bit column. 89 while(trail<limitTrail) { 90 table[trail++]|=bits; 91 } 92 } else { 93 // Partial vertical bit column, 94 // followed by a bit rectangle, 95 // followed by another partial vertical bit column. 96 if(trail>0) { 97 do { 98 table[trail++]|=bits; 99 } while(trail<64); 100 ++lead; 101 } 102 if(lead<limitLead) { 103 bits = ~((static_cast<unsigned>(1) << lead) - 1); 104 if(limitLead<0x20) { 105 bits &= (static_cast<unsigned>(1) << limitLead) - 1; 106 } 107 for(trail=0; trail<64; ++trail) { 108 table[trail]|=bits; 109 } 110 } 111 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. 112 // In that case, bits=1<<limitLead is undefined but the bits value 113 // is not used because trail<limitTrail is already false. 114 bits = static_cast<uint32_t>(1) << ((limitLead == 0x20) ? (limitLead - 1) : limitLead); 115 for(trail=0; trail<limitTrail; ++trail) { 116 table[trail]|=bits; 117 } 118 } 119 } 120 121 void BMPSet::initBits() { 122 UChar32 start, limit; 123 int32_t listIndex=0; 124 125 // Set latin1Contains[]. 126 do { 127 start=list[listIndex++]; 128 if(listIndex<listLength) { 129 limit=list[listIndex++]; 130 } else { 131 limit=0x110000; 132 } 133 if(start>=0x100) { 134 break; 135 } 136 do { 137 latin1Contains[start++]=1; 138 } while(start<limit && start<0x100); 139 } while(limit<=0x100); 140 141 // Find the first range overlapping with (or after) 80..FF again, 142 // to include them in table7FF as well. 143 for(listIndex=0;;) { 144 start=list[listIndex++]; 145 if(listIndex<listLength) { 146 limit=list[listIndex++]; 147 } else { 148 limit=0x110000; 149 } 150 if(limit>0x80) { 151 if(start<0x80) { 152 start=0x80; 153 } 154 break; 155 } 156 } 157 158 // Set table7FF[]. 159 while(start<0x800) { 160 set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800); 161 if(limit>0x800) { 162 start=0x800; 163 break; 164 } 165 166 start=list[listIndex++]; 167 if(listIndex<listLength) { 168 limit=list[listIndex++]; 169 } else { 170 limit=0x110000; 171 } 172 } 173 174 // Set bmpBlockBits[]. 175 int32_t minStart=0x800; 176 while(start<0x10000) { 177 if(limit>0x10000) { 178 limit=0x10000; 179 } 180 181 if(start<minStart) { 182 start=minStart; 183 } 184 if(start<limit) { // Else: Another range entirely in a known mixed-value block. 185 if(start&0x3f) { 186 // Mixed-value block of 64 code points. 187 start>>=6; 188 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6); 189 start=(start+1)<<6; // Round up to the next block boundary. 190 minStart=start; // Ignore further ranges in this block. 191 } 192 if(start<limit) { 193 if(start<(limit&~0x3f)) { 194 // Multiple all-ones blocks of 64 code points each. 195 set32x64Bits(bmpBlockBits, start>>6, limit>>6); 196 } 197 198 if(limit&0x3f) { 199 // Mixed-value block of 64 code points. 200 limit>>=6; 201 bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6); 202 limit=(limit+1)<<6; // Round up to the next block boundary. 203 minStart=limit; // Ignore further ranges in this block. 204 } 205 } 206 } 207 208 if(limit==0x10000) { 209 break; 210 } 211 212 start=list[listIndex++]; 213 if(listIndex<listLength) { 214 limit=list[listIndex++]; 215 } else { 216 limit=0x110000; 217 } 218 } 219 } 220 221 /* 222 * Override some bits and bytes to the result of contains(FFFD) 223 * for faster validity checking at runtime. 224 * No need to set 0 values where they were reset to 0 in the constructor 225 * and not modified by initBits(). 226 * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF) 227 * Need to set 0 values for surrogates D800..DFFF. 228 */ 229 void BMPSet::overrideIllegal() { 230 uint32_t bits, mask; 231 int32_t i; 232 233 if(containsFFFD) { 234 bits=3; // Lead bytes 0xC0 and 0xC1. 235 for(i=0; i<64; ++i) { 236 table7FF[i]|=bits; 237 } 238 239 bits=1; // Lead byte 0xE0. 240 for(i=0; i<32; ++i) { // First half of 4k block. 241 bmpBlockBits[i]|=bits; 242 } 243 244 mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED. 245 bits=1<<0xd; 246 for(i=32; i<64; ++i) { // Second half of 4k block. 247 bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits; 248 } 249 } else { 250 mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED. 251 for(i=32; i<64; ++i) { // Second half of 4k block. 252 bmpBlockBits[i]&=mask; 253 } 254 } 255 } 256 257 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { 258 /* Examples: 259 findCodePoint(c) 260 set list[] c=0 1 3 4 7 8 261 === ============== =========== 262 [] [110000] 0 0 0 0 0 0 263 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 264 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 265 [:Any:] [0, 110000] 1 1 1 1 1 1 266 */ 267 268 // Return the smallest i such that c < list[i]. Assume 269 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 270 if (c < list[lo]) 271 return lo; 272 // High runner test. c is often after the last range, so an 273 // initial check for this condition pays off. 274 if (lo >= hi || c >= list[hi-1]) 275 return hi; 276 // invariant: c >= list[lo] 277 // invariant: c < list[hi] 278 for (;;) { 279 int32_t i = (lo + hi) >> 1; 280 if (i == lo) { 281 break; // Found! 282 } else if (c < list[i]) { 283 hi = i; 284 } else { 285 lo = i; 286 } 287 } 288 return hi; 289 } 290 291 UBool 292 BMPSet::contains(UChar32 c) const { 293 if (static_cast<uint32_t>(c) <= 0xff) { 294 return latin1Contains[c]; 295 } else if (static_cast<uint32_t>(c) <= 0x7ff) { 296 return (table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0; 297 } else if (static_cast<uint32_t>(c) < 0xd800 || (c >= 0xe000 && c <= 0xffff)) { 298 int lead=c>>12; 299 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 300 if(twoBits<=1) { 301 // All 64 code points with the same bits 15..6 302 // are either in the set or not. 303 return twoBits; 304 } else { 305 // Look up the code point in its 4k block of code points. 306 return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]); 307 } 308 } else if (static_cast<uint32_t>(c) <= 0x10ffff) { 309 // surrogate or supplementary code point 310 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); 311 } else { 312 // Out-of-range code points get false, consistent with long-standing 313 // behavior of UnicodeSet::contains(c). 314 return false; 315 } 316 } 317 318 /* 319 * Check for sufficient length for trail unit for each surrogate pair. 320 * Handle single surrogates as surrogate code points as usual in ICU. 321 */ 322 const char16_t * 323 BMPSet::span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const { 324 char16_t c, c2; 325 326 if(spanCondition) { 327 // span 328 do { 329 c=*s; 330 if(c<=0xff) { 331 if(!latin1Contains[c]) { 332 break; 333 } 334 } else if(c<=0x7ff) { 335 if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) == 0) { 336 break; 337 } 338 } else if(c<0xd800 || c>=0xe000) { 339 int lead=c>>12; 340 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 341 if(twoBits<=1) { 342 // All 64 code points with the same bits 15..6 343 // are either in the set or not. 344 if(twoBits==0) { 345 break; 346 } 347 } else { 348 // Look up the code point in its 4k block of code points. 349 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 350 break; 351 } 352 } 353 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { 354 // surrogate code point 355 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 356 break; 357 } 358 } else { 359 // surrogate pair 360 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { 361 break; 362 } 363 ++s; 364 } 365 } while(++s<limit); 366 } else { 367 // span not 368 do { 369 c=*s; 370 if(c<=0xff) { 371 if(latin1Contains[c]) { 372 break; 373 } 374 } else if(c<=0x7ff) { 375 if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) { 376 break; 377 } 378 } else if(c<0xd800 || c>=0xe000) { 379 int lead=c>>12; 380 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 381 if(twoBits<=1) { 382 // All 64 code points with the same bits 15..6 383 // are either in the set or not. 384 if(twoBits!=0) { 385 break; 386 } 387 } else { 388 // Look up the code point in its 4k block of code points. 389 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 390 break; 391 } 392 } 393 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { 394 // surrogate code point 395 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 396 break; 397 } 398 } else { 399 // surrogate pair 400 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { 401 break; 402 } 403 ++s; 404 } 405 } while(++s<limit); 406 } 407 return s; 408 } 409 410 /* Symmetrical with span(). */ 411 const char16_t * 412 BMPSet::spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const { 413 char16_t c, c2; 414 415 if(spanCondition) { 416 // span 417 for(;;) { 418 c=*(--limit); 419 if(c<=0xff) { 420 if(!latin1Contains[c]) { 421 break; 422 } 423 } else if(c<=0x7ff) { 424 if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) == 0) { 425 break; 426 } 427 } else if(c<0xd800 || c>=0xe000) { 428 int lead=c>>12; 429 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 430 if(twoBits<=1) { 431 // All 64 code points with the same bits 15..6 432 // are either in the set or not. 433 if(twoBits==0) { 434 break; 435 } 436 } else { 437 // Look up the code point in its 4k block of code points. 438 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 439 break; 440 } 441 } 442 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { 443 // surrogate code point 444 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 445 break; 446 } 447 } else { 448 // surrogate pair 449 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { 450 break; 451 } 452 --limit; 453 } 454 if(s==limit) { 455 return s; 456 } 457 } 458 } else { 459 // span not 460 for(;;) { 461 c=*(--limit); 462 if(c<=0xff) { 463 if(latin1Contains[c]) { 464 break; 465 } 466 } else if(c<=0x7ff) { 467 if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) { 468 break; 469 } 470 } else if(c<0xd800 || c>=0xe000) { 471 int lead=c>>12; 472 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 473 if(twoBits<=1) { 474 // All 64 code points with the same bits 15..6 475 // are either in the set or not. 476 if(twoBits!=0) { 477 break; 478 } 479 } else { 480 // Look up the code point in its 4k block of code points. 481 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 482 break; 483 } 484 } 485 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { 486 // surrogate code point 487 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 488 break; 489 } 490 } else { 491 // surrogate pair 492 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { 493 break; 494 } 495 --limit; 496 } 497 if(s==limit) { 498 return s; 499 } 500 } 501 } 502 return limit+1; 503 } 504 505 /* 506 * Precheck for sufficient trail bytes at end of string only once per span. 507 * Check validity. 508 */ 509 const uint8_t * 510 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 511 const uint8_t *limit=s+length; 512 uint8_t b=*s; 513 if(U8_IS_SINGLE(b)) { 514 // Initial all-ASCII span. 515 if(spanCondition) { 516 do { 517 if(!latin1Contains[b] || ++s==limit) { 518 return s; 519 } 520 b=*s; 521 } while(U8_IS_SINGLE(b)); 522 } else { 523 do { 524 if(latin1Contains[b] || ++s==limit) { 525 return s; 526 } 527 b=*s; 528 } while(U8_IS_SINGLE(b)); 529 } 530 length = static_cast<int32_t>(limit - s); 531 } 532 533 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 534 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 535 } 536 537 const uint8_t *limit0=limit; 538 539 /* 540 * Make sure that the last 1/2/3/4-byte sequence before limit is complete 541 * or runs into a lead byte. 542 * In the span loop compare s with limit only once 543 * per multi-byte character. 544 * 545 * Give a trailing illegal sequence the same value as the result of contains(FFFD), 546 * including it if that is part of the span, otherwise set limit0 to before 547 * the truncated sequence. 548 */ 549 b=*(limit-1); 550 if (static_cast<int8_t>(b) < 0) { 551 // b>=0x80: lead or trail byte 552 if(b<0xc0) { 553 // single trail byte, check for preceding 3- or 4-byte lead byte 554 if(length>=2 && (b=*(limit-2))>=0xe0) { 555 limit-=2; 556 if(containsFFFD!=spanCondition) { 557 limit0=limit; 558 } 559 } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { 560 // 4-byte lead byte with only two trail bytes 561 limit-=3; 562 if(containsFFFD!=spanCondition) { 563 limit0=limit; 564 } 565 } 566 } else { 567 // lead byte with no trail bytes 568 --limit; 569 if(containsFFFD!=spanCondition) { 570 limit0=limit; 571 } 572 } 573 } 574 575 uint8_t t1, t2, t3; 576 577 while(s<limit) { 578 b=*s; 579 if(U8_IS_SINGLE(b)) { 580 // ASCII 581 if(spanCondition) { 582 do { 583 if(!latin1Contains[b]) { 584 return s; 585 } else if(++s==limit) { 586 return limit0; 587 } 588 b=*s; 589 } while(U8_IS_SINGLE(b)); 590 } else { 591 do { 592 if(latin1Contains[b]) { 593 return s; 594 } else if(++s==limit) { 595 return limit0; 596 } 597 b=*s; 598 } while(U8_IS_SINGLE(b)); 599 } 600 } 601 ++s; // Advance past the lead byte. 602 if(b>=0xe0) { 603 if(b<0xf0) { 604 if( /* handle U+0000..U+FFFF inline */ 605 (t1 = static_cast<uint8_t>(s[0] - 0x80)) <= 0x3f && 606 (t2 = static_cast<uint8_t>(s[1] - 0x80)) <= 0x3f 607 ) { 608 b&=0xf; 609 uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001; 610 if(twoBits<=1) { 611 // All 64 code points with this lead byte and middle trail byte 612 // are either in the set or not. 613 if (twoBits != static_cast<uint32_t>(spanCondition)) { 614 return s-1; 615 } 616 } else { 617 // Look up the code point in its 4k block of code points. 618 UChar32 c=(b<<12)|(t1<<6)|t2; 619 if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) { 620 return s-1; 621 } 622 } 623 s+=2; 624 continue; 625 } 626 } else if( /* handle U+10000..U+10FFFF inline */ 627 (t1 = static_cast<uint8_t>(s[0] - 0x80)) <= 0x3f && 628 (t2 = static_cast<uint8_t>(s[1] - 0x80)) <= 0x3f && 629 (t3 = static_cast<uint8_t>(s[2] - 0x80)) <= 0x3f 630 ) { 631 // Give an illegal sequence the same value as the result of contains(FFFD). 632 UChar32 c = (static_cast<UChar32>(b - 0xf0) << 18) | (static_cast<UChar32>(t1) << 12) | (t2 << 6) | t3; 633 if( ( (0x10000<=c && c<=0x10ffff) ? 634 containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) : 635 containsFFFD 636 ) != spanCondition 637 ) { 638 return s-1; 639 } 640 s+=3; 641 continue; 642 } 643 } else { 644 if( /* handle U+0000..U+07FF inline */ 645 b>=0xc0 && 646 (t1 = static_cast<uint8_t>(*s - 0x80)) <= 0x3f 647 ) { 648 if (static_cast<USetSpanCondition>((table7FF[t1] & (static_cast<uint32_t>(1) << (b & 0x1f))) != 0) != spanCondition) { 649 return s-1; 650 } 651 ++s; 652 continue; 653 } 654 } 655 656 // Give an illegal sequence the same value as the result of contains(FFFD). 657 // Handle each byte of an illegal sequence separately to simplify the code; 658 // no need to optimize error handling. 659 if(containsFFFD!=spanCondition) { 660 return s-1; 661 } 662 } 663 664 return limit0; 665 } 666 667 /* 668 * While going backwards through UTF-8 optimize only for ASCII. 669 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not 670 * possible to tell from the last byte in a multi-byte sequence how many 671 * preceding bytes there should be. Therefore, going backwards through UTF-8 672 * is much harder than going forward. 673 */ 674 int32_t 675 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 676 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 677 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 678 } 679 680 uint8_t b; 681 682 do { 683 b=s[--length]; 684 if(U8_IS_SINGLE(b)) { 685 // ASCII sub-span 686 if(spanCondition) { 687 do { 688 if(!latin1Contains[b]) { 689 return length+1; 690 } else if(length==0) { 691 return 0; 692 } 693 b=s[--length]; 694 } while(U8_IS_SINGLE(b)); 695 } else { 696 do { 697 if(latin1Contains[b]) { 698 return length+1; 699 } else if(length==0) { 700 return 0; 701 } 702 b=s[--length]; 703 } while(U8_IS_SINGLE(b)); 704 } 705 } 706 707 int32_t prev=length; 708 UChar32 c; 709 // trail byte: collect a multi-byte character 710 // (or lead byte in last-trail position) 711 c=utf8_prevCharSafeBody(s, 0, &length, b, -3); 712 // c is a valid code point, not ASCII, not a surrogate 713 if(c<=0x7ff) { 714 if (static_cast<USetSpanCondition>((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) != spanCondition) { 715 return prev+1; 716 } 717 } else if(c<=0xffff) { 718 int lead=c>>12; 719 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 720 if(twoBits<=1) { 721 // All 64 code points with the same bits 15..6 722 // are either in the set or not. 723 if (twoBits != static_cast<uint32_t>(spanCondition)) { 724 return prev+1; 725 } 726 } else { 727 // Look up the code point in its 4k block of code points. 728 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) { 729 return prev+1; 730 } 731 } 732 } else { 733 if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) { 734 return prev+1; 735 } 736 } 737 } while(length>0); 738 return 0; 739 } 740 741 U_NAMESPACE_END