uniset_props.cpp (39796B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1999-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uniset_props.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2004aug25 16 * created by: Markus W. Scherer 17 * 18 * Character property dependent functions moved here from uniset.cpp 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/uniset.h" 23 #include "unicode/parsepos.h" 24 #include "unicode/uchar.h" 25 #include "unicode/uscript.h" 26 #include "unicode/symtable.h" 27 #include "unicode/uset.h" 28 #include "unicode/locid.h" 29 #include "unicode/brkiter.h" 30 #include "uset_imp.h" 31 #include "ruleiter.h" 32 #include "cmemory.h" 33 #include "ucln_cmn.h" 34 #include "util.h" 35 #include "uvector.h" 36 #include "uprops.h" 37 #include "propname.h" 38 #include "normalizer2impl.h" 39 #include "uinvchar.h" 40 #include "uprops.h" 41 #include "charstr.h" 42 #include "cstring.h" 43 #include "mutex.h" 44 #include "umutex.h" 45 #include "uassert.h" 46 #include "hash.h" 47 48 U_NAMESPACE_USE 49 50 namespace { 51 52 // Special property set IDs 53 constexpr char ANY[] = "ANY"; // [\u0000-\U0010FFFF] 54 constexpr char ASCII[] = "ASCII"; // [\u0000-\u007F] 55 constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:] 56 57 // Unicode name property alias 58 constexpr char16_t NAME_PROP[] = u"na"; 59 60 } // namespace 61 62 // Cached sets ------------------------------------------------------------- *** 63 64 U_CDECL_BEGIN 65 static UBool U_CALLCONV uset_cleanup(); 66 67 static UnicodeSet *uni32Singleton; 68 static icu::UInitOnce uni32InitOnce {}; 69 70 /** 71 * Cleanup function for UnicodeSet 72 */ 73 static UBool 
U_CALLCONV uset_cleanup() { 74 delete uni32Singleton; 75 uni32Singleton = nullptr; 76 uni32InitOnce.reset(); 77 return true; 78 } 79 80 U_CDECL_END 81 82 U_NAMESPACE_BEGIN 83 84 namespace { 85 86 // Cache some sets for other services -------------------------------------- *** 87 void U_CALLCONV createUni32Set(UErrorCode &errorCode) { 88 U_ASSERT(uni32Singleton == nullptr); 89 uni32Singleton = new UnicodeSet(UnicodeString(u"[:age=3.2:]"), errorCode); 90 if(uni32Singleton==nullptr) { 91 errorCode=U_MEMORY_ALLOCATION_ERROR; 92 } else { 93 uni32Singleton->freeze(); 94 } 95 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 96 } 97 98 99 U_CFUNC UnicodeSet * 100 uniset_getUnicode32Instance(UErrorCode &errorCode) { 101 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); 102 return uni32Singleton; 103 } 104 105 // helper functions for matching of pattern syntax pieces ------------------ *** 106 // these functions are parallel to the PERL_OPEN etc. strings above 107 108 // using these functions is not only faster than UnicodeString::compare() and 109 // caseCompare(), but they also make UnicodeSet work for simple patterns when 110 // no Unicode properties data is available - when caseCompare() fails 111 112 inline UBool 113 isPerlOpen(const UnicodeString &pattern, int32_t pos) { 114 char16_t c; 115 return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P'); 116 } 117 118 /*static inline UBool 119 isPerlClose(const UnicodeString &pattern, int32_t pos) { 120 return pattern.charAt(pos)==u'}'; 121 }*/ 122 123 inline UBool 124 isNameOpen(const UnicodeString &pattern, int32_t pos) { 125 return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N'; 126 } 127 128 inline UBool 129 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { 130 return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':'; 131 } 132 133 /*static inline UBool 134 isPOSIXClose(const UnicodeString &pattern, int32_t pos) { 135 return pattern.charAt(pos)==u':' && 
pattern.charAt(pos+1)==u']'; 136 }*/ 137 138 // TODO memory debugging provided inside uniset.cpp 139 // could be made available here but probably obsolete with use of modern 140 // memory leak checker tools 141 #define _dbgct(me) 142 143 } // namespace 144 145 //---------------------------------------------------------------- 146 // Constructors &c 147 //---------------------------------------------------------------- 148 149 /** 150 * Constructs a set from the given pattern, optionally ignoring 151 * white space. See the class description for the syntax of the 152 * pattern language. 153 * @param pattern a string specifying what characters are in the set 154 */ 155 UnicodeSet::UnicodeSet(const UnicodeString& pattern, 156 UErrorCode& status) { 157 applyPattern(pattern, status); 158 _dbgct(this); 159 } 160 161 //---------------------------------------------------------------- 162 // Public API 163 //---------------------------------------------------------------- 164 165 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 166 UErrorCode& status) { 167 // Equivalent to 168 // return applyPattern(pattern, USET_IGNORE_SPACE, nullptr, status); 169 // but without dependency on closeOver(). 170 ParsePosition pos(0); 171 applyPatternIgnoreSpace(pattern, pos, nullptr, status); 172 if (U_FAILURE(status)) return *this; 173 174 int32_t i = pos.getIndex(); 175 // Skip over trailing whitespace 176 ICU_Utility::skipWhitespace(pattern, i, true); 177 if (i != pattern.length()) { 178 status = U_ILLEGAL_ARGUMENT_ERROR; 179 } 180 return *this; 181 } 182 183 void 184 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, 185 ParsePosition& pos, 186 const SymbolTable* symbols, 187 UErrorCode& status) { 188 if (U_FAILURE(status)) { 189 return; 190 } 191 if (isFrozen()) { 192 status = U_NO_WRITE_PERMISSION; 193 return; 194 } 195 // Need to build the pattern in a temporary string because 196 // _applyPattern calls add() etc., which set pat to empty. 
197 UnicodeString rebuiltPat; 198 RuleCharacterIterator chars(pattern, symbols, pos); 199 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status); 200 if (U_FAILURE(status)) return; 201 if (chars.inVariable()) { 202 // syntaxError(chars, "Extra chars in variable value"); 203 status = U_MALFORMED_SET; 204 return; 205 } 206 setPattern(rebuiltPat); 207 } 208 209 /** 210 * Return true if the given position, in the given pattern, appears 211 * to be the start of a UnicodeSet pattern. 212 */ 213 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { 214 return ((pos+1) < pattern.length() && 215 pattern.charAt(pos) == static_cast<char16_t>(91)/*[*/) || 216 resemblesPropertyPattern(pattern, pos); 217 } 218 219 //---------------------------------------------------------------- 220 // Implementation: Pattern parsing 221 //---------------------------------------------------------------- 222 223 namespace { 224 225 /** 226 * A small all-inline class to manage a UnicodeSet pointer. Add 227 * operator->() etc. as needed. 228 */ 229 class UnicodeSetPointer { 230 UnicodeSet* p; 231 public: 232 inline UnicodeSetPointer() : p(nullptr) {} 233 inline ~UnicodeSetPointer() { delete p; } 234 inline UnicodeSet* pointer() { return p; } 235 inline UBool allocate() { 236 if (p == nullptr) { 237 p = new UnicodeSet(); 238 } 239 return p != nullptr; 240 } 241 }; 242 243 constexpr int32_t MAX_DEPTH = 100; 244 245 } // namespace 246 247 /** 248 * Parse the pattern from the given RuleCharacterIterator. The 249 * iterator is advanced over the parsed pattern. 250 * @param chars iterator over the pattern characters. Upon return 251 * it will be advanced to the first character after the parsed 252 * pattern, or the end of the iteration if all characters are 253 * parsed. 254 * @param symbols symbol table to use to parse and dereference 255 * variables, or null if none. 
* @param rebuiltPat the pattern that was parsed, rebuilt or
 * copied from the input pattern, as appropriate.
 * @param options a bit mask of zero or more of the following:
 * IGNORE_SPACE, CASE.
 * @param caseClosure member function invoked as (this->*caseClosure)(options)
 * when any USET_CASE_MASK bit is set in options; not called otherwise.
 * @param depth nesting level of this call; nested bracketed sets recurse with
 * depth + 1, and a value above MAX_DEPTH fails with U_ILLEGAL_ARGUMENT_ERROR.
 * @param ec in/out error code; syntax errors are reported as U_MALFORMED_SET.
 */
void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                              const SymbolTable* symbols,
                              UnicodeString& rebuiltPat,
                              uint32_t options,
                              UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
                              int32_t depth,
                              UErrorCode& ec) {
    if (U_FAILURE(ec)) return;
    if (depth > MAX_DEPTH) {
        ec = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Syntax characters: [ ] ^ - & { }

    // Recognized special forms for chars, sets: c-c s-s s&s

    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
                   RuleCharacterIterator::PARSE_ESCAPES;
    if ((options & USET_IGNORE_SPACE) != 0) {
        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    }

    // patLocal accumulates the pattern text as it is parsed; buf holds the
    // contents of a {multicharacter string}.  usePat records whether patLocal
    // (rather than a generated pattern) must be handed back to the caller.
    UnicodeString patLocal, buf;
    UBool usePat = false;
    UnicodeSetPointer scratch;
    RuleCharacterIterator::Pos backup;

    // mode: 0=before [, 1=between [...], 2=after ]
    // lastItem: 0=none, 1=char, 2=set
    int8_t lastItem = 0, mode = 0;
    UChar32 lastChar = 0;
    char16_t op = 0;

    UBool invert = false;

    clear();

    while (mode != 2 && !chars.atEnd()) {
        U_ASSERT((lastItem == 0 && op == 0) ||
                 (lastItem == 1 && (op == 0 || op == u'-')) ||
                 (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));

        UChar32 c = 0;
        UBool literal = false;
        UnicodeSet* nested = nullptr; // alias - do not delete

        // -------- Check for property pattern

        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
        int8_t setMode = 0;
        if (resemblesPropertyPattern(chars, opts)) {
            setMode = 2;
        }

        // -------- Parse '[' of opening delimiter OR nested set.
        // If there is a nested set, use `setMode' to define how
        // the set should be parsed.  If the '[' is part of the
        // opening delimiter for this pattern, parse special
        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
        // characters representing a nested set in the symbol
        // table.

        else {
            // Prepare to backup if necessary
            chars.getPos(backup);
            c = chars.next(opts, literal, ec);
            if (U_FAILURE(ec)) return;

            if (c == u'[' && !literal) {
                if (mode == 1) {
                    chars.setPos(backup); // backup
                    setMode = 1;
                } else {
                    // Handle opening '[' delimiter
                    mode = 1;
                    patLocal.append(u'[');
                    chars.getPos(backup); // prepare to backup
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    if (c == u'^' && !literal) {
                        invert = true;
                        patLocal.append(u'^');
                        chars.getPos(backup); // prepare to backup
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                    }
                    // Fall through to handle special leading '-';
                    // otherwise restart loop for nested [], \p{}, etc.
                    if (c == u'-') {
                        literal = true;
                        // Fall through to handle literal '-' below
                    } else {
                        chars.setPos(backup); // backup
                        continue;
                    }
                }
            } else if (symbols != nullptr) {
                const UnicodeFunctor *m = symbols->lookupMatcher(c);
                if (m != nullptr) {
                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
                    if (ms == nullptr) {
                        ec = U_MALFORMED_SET;
                        return;
                    }
                    // casting away const, but `nested' won't be modified
                    // (important not to modify stored set)
                    nested = const_cast<UnicodeSet*>(ms);
                    setMode = 3;
                }
            }
        }

        // -------- Handle a nested set.  This either is inline in
        // the pattern or represented by a stand-in that has
        // previously been parsed and was looked up in the symbol
        // table.

        if (setMode != 0) {
            if (lastItem == 1) {
                if (op != 0) {
                    // syntaxError(chars, "Char expected after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                // Flush the pending single character before the nested set.
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, false);
                lastItem = 0;
                op = 0;
            }

            if (op == u'-' || op == u'&') {
                patLocal.append(op);
            }

            if (nested == nullptr) {
                // lazy allocation
                if (!scratch.allocate()) {
                    ec = U_MEMORY_ALLOCATION_ERROR;
                    return;
                }
                nested = scratch.pointer();
            }
            switch (setMode) {
            case 1:
                nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
                break;
            case 2:
                chars.skipIgnored(opts);
                nested->applyPropertyPattern(chars, patLocal, ec);
                if (U_FAILURE(ec)) return;
                break;
            case 3: // `nested' already parsed
                nested->_toPattern(patLocal, false);
                break;
            }

            usePat = true;

            if (mode == 0) {
                // Entire pattern is a category; leave parse loop
                *this = *nested;
                mode = 2;
                break;
            }

            // Combine the nested set with what has been collected so far.
            switch (op) {
            case u'-':
                removeAll(*nested);
                break;
            case u'&':
                retainAll(*nested);
                break;
            case 0:
                addAll(*nested);
                break;
            }

            op = 0;
            lastItem = 2;

            continue;
        }

        if (mode == 0) {
            // syntaxError(chars, "Missing '['");
            ec = U_MALFORMED_SET;
            return;
        }

        // -------- Parse special (syntax) characters.  If the
        // current character is not special, or if it is escaped,
        // then fall through and handle it below.

        if (!literal) {
            switch (c) {
            case u']':
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, false);
                }
                // Treat final trailing '-' as a literal
                if (op == u'-') {
                    add(op, op);
                    patLocal.append(op);
                } else if (op == u'&') {
                    // syntaxError(chars, "Trailing '&'");
                    ec = U_MALFORMED_SET;
                    return;
                }
                patLocal.append(u']');
                mode = 2;
                continue;
            case u'-':
                if (op == 0) {
                    if (lastItem != 0) {
                        op = static_cast<char16_t>(c);
                        continue;
                    } else {
                        // Treat final trailing '-' as a literal
                        add(c, c);
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == u']' && !literal) {
                            patLocal.append(u"-]", 2);
                            mode = 2;
                            continue;
                        }
                    }
                }
                // syntaxError(chars, "'-' not after char or set");
                ec = U_MALFORMED_SET;
                return;
            case u'&':
                if (lastItem == 2 && op == 0) {
                    op = static_cast<char16_t>(c);
                    continue;
                }
                // syntaxError(chars, "'&' not after set");
                ec = U_MALFORMED_SET;
                return;
            case u'^':
                // syntaxError(chars, "'^' not after '['");
                ec = U_MALFORMED_SET;
                return;
            case u'{':
                if (op != 0) {
                    // syntaxError(chars, "Missing operand after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, false);
                }
                lastItem = 0;
                buf.truncate(0);
                {
                    // Collect everything up to the unescaped closing '}'.
                    UBool ok = false;
                    while (!chars.atEnd()) {
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == u'}' && !literal) {
                            ok = true;
                            break;
                        }
                        buf.append(c);
                    }
                    if (!ok) {
                        // syntaxError(chars, "Invalid multicharacter string");
                        ec = U_MALFORMED_SET;
                        return;
                    }
                }
                // We have new string. Add it to set and continue;
                // we don't need to drop through to the further
                // processing
                add(buf);
                patLocal.append(u'{');
                _appendToPat(patLocal, buf, false);
                patLocal.append(u'}');
                continue;
            case SymbolTable::SYMBOL_REF:
                //         symbols  nosymbols
                // [a-$]   error    error (ambiguous)
                // [a$]    anchor   anchor
                // [a-$x]  var "x"* literal '$'
                // [a-$.]  error    literal '$'
                // *We won't get here in the case of var "x"
                {
                    chars.getPos(backup);
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    UBool anchor = (c == u']' && !literal);
                    if (symbols == nullptr && !anchor) {
                        c = SymbolTable::SYMBOL_REF;
                        chars.setPos(backup);
                        break; // literal '$'
                    }
                    if (anchor && op == 0) {
                        if (lastItem == 1) {
                            add(lastChar, lastChar);
                            _appendToPat(patLocal, lastChar, false);
                        }
                        add(U_ETHER);
                        usePat = true;
                        patLocal.append(static_cast<char16_t>(SymbolTable::SYMBOL_REF));
                        patLocal.append(u']');
                        mode = 2;
                        continue;
                    }
                    // syntaxError(chars, "Unquoted '$'");
                    ec = U_MALFORMED_SET;
                    return;
                }
            default:
                break;
            }
        }

        // -------- Parse literal characters.  This includes both
        // escaped chars ("\u4E01") and non-syntax characters
        // ("a").

        switch (lastItem) {
        case 0:
            lastItem = 1;
            lastChar = c;
            break;
        case 1:
            if (op == u'-') {
                if (lastChar >= c) {
                    // Don't allow redundant (a-a) or empty (b-a) ranges;
                    // these are most likely typos.
                    // syntaxError(chars, "Invalid range");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, c);
                _appendToPat(patLocal, lastChar, false);
                patLocal.append(op);
                _appendToPat(patLocal, c, false);
                lastItem = 0;
                op = 0;
            } else {
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, false);
                lastChar = c;
            }
            break;
        case 2:
            if (op != 0) {
                // syntaxError(chars, "Set expected after operator");
                ec = U_MALFORMED_SET;
                return;
            }
            lastChar = c;
            lastItem = 1;
            break;
        }
    }

    if (mode != 2) {
        // syntaxError(chars, "Missing ']'");
        ec = U_MALFORMED_SET;
        return;
    }

    chars.skipIgnored(opts);

    /**
     * Handle global flags (invert, case insensitivity).  If this
     * pattern should be compiled case-insensitive, then we need
     * to close over case BEFORE COMPLEMENTING.  This makes
     * patterns like /[^abc]/i work.
     */
    if ((options & USET_CASE_MASK) != 0) {
        (this->*caseClosure)(options);
    }
    if (invert) {
        complement().removeAllStrings();  // code point complement
    }

    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
    // generated pattern.
    if (usePat) {
        rebuiltPat.append(patLocal);
    } else {
        _generatePattern(rebuiltPat, false);
    }
    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
}

//----------------------------------------------------------------
// Property set implementation
//----------------------------------------------------------------

namespace {

// Filter callbacks passed to UnicodeSet::applyFilter().  Each one receives a
// code point plus an opaque context pointer that it casts back to the type
// noted below, and returns whether the code point matches.

// context: double* -- matches when u_getNumericValue(ch) equals that value.
UBool numericValueFilter(UChar32 ch, void* context) {
    return u_getNumericValue(ch) == *static_cast<double*>(context);
}

// context: int32_t* holding a general-category mask -- matches when the
// code point's category mask shares a bit with it.
UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
    int32_t value = *static_cast<int32_t*>(context);
    return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
}

// context: UVersionInfo* -- matches when the age from u_charAge() is not
// all-zero and compares (bytewise) less than or equal to that version.
UBool versionFilter(UChar32 ch, void* context) {
    static const UVersionInfo none = { 0, 0, 0, 0 };
    UVersionInfo v;
    u_charAge(ch, v);
    UVersionInfo* version = static_cast<UVersionInfo*>(context);
    return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
}

// Context for intPropertyFilter: the property to query and the value to match.
typedef struct {
    UProperty prop;
    int32_t value;
} IntPropertyContext;

// context: IntPropertyContext* -- matches when u_getIntPropertyValue() for
// the stored property equals the stored value.
UBool intPropertyFilter(UChar32 ch, void* context) {
    IntPropertyContext* c = static_cast<IntPropertyContext*>(context);
    return u_getIntPropertyValue(ch, c->prop) == c->value;
}

// context: UScriptCode* -- result of uscript_hasScript() for that script.
UBool scriptExtensionsFilter(UChar32 ch, void* context) {
    return uscript_hasScript(ch, *static_cast<UScriptCode*>(context));
}

// context: UIdentifierType* -- result of u_hasIDType() for that type.
UBool idTypeFilter(UChar32 ch, void* context) {
    return u_hasIDType(ch, *static_cast<UIdentifierType*>(context));
}

}  // namespace

/**
 * Generic filter-based scanning code for UCD property UnicodeSets.
 * Clears this set, then adds every run of code points for which
 * filter(ch, context) is true, examining only the code points contained
 * in `inclusions`.
 * @param filter     predicate applied to each examined code point
 * @param context    opaque pointer forwarded to the filter
 * @param inclusions set whose ranges are walked; dereferenced without a
 *                   null check once status has been found to be a success
 * @param status     in/out error code; set to U_MEMORY_ALLOCATION_ERROR if
 *                   this set ends up bogus
 */
void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
                             void* context,
                             const UnicodeSet* inclusions,
                             UErrorCode &status) {
    if (U_FAILURE(status)) return;

    // Logically, walk through all Unicode characters, noting the start
    // and end of each range for which filter.contain(c) is
    // true.  Add each range to a set.
    //
    // To improve performance, use an inclusions set which
    // encodes information about character ranges that are known
    // to have identical properties.
    // inclusions contains the first characters of
    // same-value ranges for the given property.

    clear();

    // Start of the run currently being collected, or -1 when not in a run.
    UChar32 startHasProperty = -1;
    int32_t limitRange = inclusions->getRangeCount();

    for (int j=0; j<limitRange; ++j) {
        // get current range
        UChar32 start = inclusions->getRangeStart(j);
        UChar32 end = inclusions->getRangeEnd(j);

        // for all the code points in the range, process
        for (UChar32 ch = start; ch <= end; ++ch) {
            // only add to this UnicodeSet on inflection points --
            // where the hasProperty value changes to false
            if ((*filter)(ch, context)) {
                if (startHasProperty < 0) {
                    startHasProperty = ch;
                }
            } else if (startHasProperty >= 0) {
                add(startHasProperty, ch-1);
                startHasProperty = -1;
            }
        }
    }
    // A run still open after the last inclusion extends to U+10FFFF.
    if (startHasProperty >= 0) {
        add(startHasProperty, static_cast<UChar32>(0x10FFFF));
    }
    if (isBogus() && U_SUCCESS(status)) {
        // We likely ran out of memory. AHHH!
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}

namespace {

/**
 * Copies src to dst while dropping leading spaces, collapsing each run of
 * spaces to a single space, and removing one trailing space; dst is
 * NUL-terminated.
 * @param dst         destination buffer
 * @param src         NUL-terminated input
 * @param dstCapacity size of dst in chars, including room for the terminator
 * @return false if the result does not fit in dst, true otherwise
 */
UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
    /* Note: we use ' ' in compiler code page */
    int32_t j = 0;
    char ch;
    --dstCapacity; /* make room for term. zero */
    while ((ch = *src++) != 0) {
        if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
            continue;
        }
        if (j >= dstCapacity) return false;
        dst[j++] = ch;
    }
    if (j > 0 && dst[j-1] == ' ') --j;
    dst[j] = 0;
    return true;
}

}  // namespace

//----------------------------------------------------------------
// Property set API
//----------------------------------------------------------------

// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing
// member function.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    ec=U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END

/**
 * Replaces the contents of this set with the code points for which property
 * `prop` has value `value`.
 * Returns without changes if ec already indicates failure or the set is
 * frozen.  The general-category mask, Script_Extensions and Identifier_Type
 * properties and the UCHAR_INT_START..UCHAR_INT_LIMIT range are evaluated
 * with applyFilter(); binary properties copy the set from
 * u_getBinaryPropertySet() (complemented for value 0, empty for values other
 * than 0 and 1).  Any other property sets U_ILLEGAL_ARGUMENT_ERROR.
 */
UnicodeSet&
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
    if (U_FAILURE(ec) || isFrozen()) { return *this; }
    if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
        applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
    } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
        UScriptCode script = static_cast<UScriptCode>(value);
        applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
    } else if (prop == UCHAR_IDENTIFIER_TYPE) {
        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
        UIdentifierType idType = static_cast<UIdentifierType>(value);
        applyFilter(idTypeFilter, &idType, inclusions, ec);
    } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
        if (value == 0 || value == 1) {
            const USet *set = u_getBinaryPropertySet(prop, &ec);
            if (U_FAILURE(ec)) { return *this; }
            copyFrom(*UnicodeSet::fromUSet(set), true);
            if (value == 0) {
                complement().removeAllStrings();  // code point complement
            }
        } else {
            clear();
        }
    } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
        const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
        IntPropertyContext c = {prop, value};
        applyFilter(intPropertyFilter, &c, inclusions, ec);
    } else {
        ec = U_ILLEGAL_ARGUMENT_ERROR;
    }
    return *this;
}

/**
 * Replaces the contents of this set according to a property name and value
 * given as strings.
 * Returns without changes if ec already indicates failure or the set is
 * frozen.  Both strings must consist of invariant characters.  With a
 * non-empty value, `prop` is looked up with u_getPropertyEnum() and the
 * value is resolved per property kind (enumerated value name, numeric
 * combining class, numeric value, character name, age, script or identifier
 * type).  With an empty value, `prop` is tried in turn as a General_Category
 * value, a Script value, a binary property name, and the special names ANY,
 * ASCII and Assigned.  Unrecognized input sets U_ILLEGAL_ARGUMENT_ERROR.
 */
UnicodeSet&
UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                               const UnicodeString& value,
                               UErrorCode& ec) {
    if (U_FAILURE(ec) || isFrozen()) return *this;

    // prop and value used to be converted to char * using the default
    // converter instead of the invariant conversion.
    // This should not be necessary because all Unicode property and value
    // names use only invariant characters.
    // If there are any variant characters, then we won't find them anyway.
    // Checking first avoids assertion failures in the conversion.
    if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
        !uprv_isInvariantUString(value.getBuffer(), value.length())
    ) {
        FAIL(ec);
    }
    CharString pname, vname;
    pname.appendInvariantChars(prop, ec);
    vname.appendInvariantChars(value, ec);
    if (U_FAILURE(ec)) return *this;

    UProperty p;
    int32_t v;
    UBool invert = false;

    if (value.length() > 0) {
        p = u_getPropertyEnum(pname.data());
        if (p == UCHAR_INVALID_CODE) FAIL(ec);

        // Treat gc as gcm
        if (p == UCHAR_GENERAL_CATEGORY) {
            p = UCHAR_GENERAL_CATEGORY_MASK;
        }

        if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
            (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
            (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
            v = u_getPropertyValueEnum(p, vname.data());
            if (v == UCHAR_INVALID_CODE) {
                // Handle numeric CCC
                if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
                    p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
                    p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
                    char* end;
                    double val = uprv_strtod(vname.data(), &end);
                    // Anything between 0 and 255 is valid even if unused.
                    // Cast double->int only after range check.
                    // We catch NaN here because comparing it with both 0 and 255 will be false
                    // (as are all comparisons with NaN).
                    if (*end != 0 || !(0 <= val && val <= 255) ||
                            (v = static_cast<int32_t>(val)) != val) {
                        // non-integral value or outside 0..255, or trailing junk
                        FAIL(ec);
                    }
                } else {
                    FAIL(ec);
                }
            }
        }

        else {

            switch (p) {
            case UCHAR_NUMERIC_VALUE:
                {
                    char* end;
                    double val = uprv_strtod(vname.data(), &end);
                    if (*end != 0) {
                        FAIL(ec);
                    }
                    applyFilter(numericValueFilter, &val,
                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
                    return *this;
                }
            case UCHAR_NAME:
                {
                    // Must munge name, since u_charFromName() does not do
                    // 'loose' matching.
                    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
                    UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
                    if (U_SUCCESS(ec)) {
                        clear();
                        add(ch);
                        return *this;
                    } else {
                        FAIL(ec);
                    }
                }
            case UCHAR_UNICODE_1_NAME:
                // ICU 49 deprecates the Unicode_1_Name property APIs.
                FAIL(ec);
            case UCHAR_AGE:
                {
                    // Must munge name, since u_versionFromString() does not do
                    // 'loose' matching.
                    char buf[128];
                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
                    UVersionInfo version;
                    u_versionFromString(version, buf);
                    applyFilter(versionFilter, &version,
                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
                    return *this;
                }
            case UCHAR_SCRIPT_EXTENSIONS:
                // The value is looked up among the Script property's values.
                v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
                if (v == UCHAR_INVALID_CODE) {
                    FAIL(ec);
                }
                // fall through to calling applyIntPropertyValue()
                break;
            case UCHAR_IDENTIFIER_TYPE:
                v = u_getPropertyValueEnum(p, vname.data());
                if (v == UCHAR_INVALID_CODE) {
                    FAIL(ec);
                }
                // fall through to calling applyIntPropertyValue()
                break;
            default:
                // p is a non-binary, non-enumerated property that we
                // don't support (yet).
                FAIL(ec);
            }
        }
    }

    else {
        // value is empty.  Interpret as General Category, Script, or
        // Binary property.
        p = UCHAR_GENERAL_CATEGORY_MASK;
        v = u_getPropertyValueEnum(p, pname.data());
        if (v == UCHAR_INVALID_CODE) {
            p = UCHAR_SCRIPT;
            v = u_getPropertyValueEnum(p, pname.data());
            if (v == UCHAR_INVALID_CODE) {
                p = u_getPropertyEnum(pname.data());
                if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
                    v = 1;
                } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
                    set(MIN_VALUE, MAX_VALUE);
                    return *this;
                } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
                    set(0, 0x7F);
                    return *this;
                } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
                    // [:Assigned:]=[:^Cn:]
                    p = UCHAR_GENERAL_CATEGORY_MASK;
                    v = U_GC_CN_MASK;
                    invert = true;
                } else {
                    FAIL(ec);
                }
            }
        }
    }

    applyIntPropertyValue(p, v, ec);
    if(invert) {
        complement().removeAllStrings();  // code point complement
    }

    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
    return *this;
}

//----------------------------------------------------------------
// Property set patterns
//----------------------------------------------------------------

/**
 * Return true if the given position, in the given pattern, appears
 * to be the start of a property set pattern.
 */
UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
                                           int32_t pos) {
    // Patterns are at least 5 characters long
    if ((pos+5) > pattern.length()) {
        return false;
    }

    // Look for an opening [:, [:^, \p, or \P
    return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
}

/**
 * Return true if the given iterator appears to point at a
 * property pattern.  Regardless of the result, return with the
 * iterator unchanged.
 * @param chars iterator over the pattern characters.  Upon return
 * it will be unchanged.
 * @param iterOpts RuleCharacterIterator options
 */
UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
                                           int32_t iterOpts) {
    // NOTE: literal will always be false, because we don't parse escapes.
    UBool result = false, literal;
    UErrorCode ec = U_ZERO_ERROR;
    iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
    RuleCharacterIterator::Pos pos;
    chars.getPos(pos);
    UChar32 c = chars.next(iterOpts, literal, ec);
    if (c == u'[' || c == u'\\') {
        // The second character is read without skipping white space.
        UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
                               literal, ec);
        result = (c == u'[') ? (d == u':') :
                               (d == u'N' || d == u'p' || d == u'P');
    }
    chars.setPos(pos);
    return result && U_SUCCESS(ec);
}

/**
 * Parse the given property pattern at the given parse position.
1034 */ 1035 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, 1036 ParsePosition& ppos, 1037 UErrorCode &ec) { 1038 int32_t pos = ppos.getIndex(); 1039 1040 UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 1041 UBool isName = false; // true for \N{pat}, o/w false 1042 UBool invert = false; 1043 1044 if (U_FAILURE(ec)) return *this; 1045 1046 // Minimum length is 5 characters, e.g. \p{L} 1047 if ((pos+5) > pattern.length()) { 1048 FAIL(ec); 1049 } 1050 1051 // On entry, ppos should point to one of the following locations: 1052 // Look for an opening [:, [:^, \p, or \P 1053 if (isPOSIXOpen(pattern, pos)) { 1054 posix = true; 1055 pos += 2; 1056 pos = ICU_Utility::skipWhitespace(pattern, pos); 1057 if (pos < pattern.length() && pattern.charAt(pos) == u'^') { 1058 ++pos; 1059 invert = true; 1060 } 1061 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { 1062 char16_t c = pattern.charAt(pos+1); 1063 invert = (c == u'P'); 1064 isName = (c == u'N'); 1065 pos += 2; 1066 pos = ICU_Utility::skipWhitespace(pattern, pos); 1067 if (pos == pattern.length() || pattern.charAt(pos++) != u'{') { 1068 // Syntax error; "\p" or "\P" not followed by "{" 1069 FAIL(ec); 1070 } 1071 } else { 1072 // Open delimiter not seen 1073 FAIL(ec); 1074 } 1075 1076 // Look for the matching close delimiter, either :] or } 1077 int32_t close; 1078 if (posix) { 1079 close = pattern.indexOf(u":]", 2, pos); 1080 } else { 1081 close = pattern.indexOf(u'}', pos); 1082 } 1083 if (close < 0) { 1084 // Syntax error; close delimiter missing 1085 FAIL(ec); 1086 } 1087 1088 // Look for an '=' sign. If this is present, we will parse a 1089 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 1090 // pattern. 
1091 int32_t equals = pattern.indexOf(u'=', pos); 1092 UnicodeString propName, valueName; 1093 if (equals >= 0 && equals < close && !isName) { 1094 // Equals seen; parse medium/long pattern 1095 pattern.extractBetween(pos, equals, propName); 1096 pattern.extractBetween(equals+1, close, valueName); 1097 } 1098 1099 else { 1100 // Handle case where no '=' is seen, and \N{} 1101 pattern.extractBetween(pos, close, propName); 1102 1103 // Handle \N{name} 1104 if (isName) { 1105 // This is a little inefficient since it means we have to 1106 // parse NAME_PROP back to UCHAR_NAME even though we already 1107 // know it's UCHAR_NAME. If we refactor the API to 1108 // support args of (UProperty, char*) then we can remove 1109 // NAME_PROP and make this a little more efficient. 1110 valueName = propName; 1111 propName = NAME_PROP; 1112 } 1113 } 1114 1115 applyPropertyAlias(propName, valueName, ec); 1116 1117 if (U_SUCCESS(ec)) { 1118 if (invert) { 1119 complement().removeAllStrings(); // code point complement 1120 } 1121 1122 // Move to the limit position after the close delimiter if the 1123 // parse succeeded. 1124 ppos.setIndex(close + (posix ? 2 : 1)); 1125 } 1126 1127 return *this; 1128 } 1129 1130 /** 1131 * Parse a property pattern. 1132 * @param chars iterator over the pattern characters. Upon return 1133 * it will be advanced to the first character after the parsed 1134 * pattern, or the end of the iteration if all characters are 1135 * parsed. 1136 * @param rebuiltPat the pattern that was parsed, rebuilt or 1137 * copied from the input pattern, as appropriate. 
1138 */ 1139 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, 1140 UnicodeString& rebuiltPat, 1141 UErrorCode& ec) { 1142 if (U_FAILURE(ec)) return; 1143 UnicodeString pattern; 1144 chars.lookahead(pattern); 1145 ParsePosition pos(0); 1146 applyPropertyPattern(pattern, pos, ec); 1147 if (U_FAILURE(ec)) return; 1148 if (pos.getIndex() == 0) { 1149 // syntaxError(chars, "Invalid property pattern"); 1150 ec = U_MALFORMED_SET; 1151 return; 1152 } 1153 chars.jumpahead(pos.getIndex()); 1154 rebuiltPat.append(pattern, 0, pos.getIndex()); 1155 } 1156 1157 U_NAMESPACE_END