collationruleparser.cpp (31681B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationruleparser.cpp 9 * 10 * (replaced the former ucol_tok.cpp) 11 * 12 * created on: 2013apr10 13 * created by: Markus W. Scherer 14 */ 15 16 #include "unicode/utypes.h" 17 18 #if !UCONFIG_NO_COLLATION 19 20 #include "unicode/normalizer2.h" 21 #include "unicode/parseerr.h" 22 #include "unicode/uchar.h" 23 #include "unicode/ucol.h" 24 #include "unicode/uloc.h" 25 #include "unicode/unistr.h" 26 #include "unicode/utf16.h" 27 #include "charstr.h" 28 #include "cmemory.h" 29 #include "collation.h" 30 #include "collationdata.h" 31 #include "collationruleparser.h" 32 #include "collationsettings.h" 33 #include "collationtailoring.h" 34 #include "cstring.h" 35 #include "patternprops.h" 36 #include "uassert.h" 37 #include "ulocimp.h" 38 #include "uvectr32.h" 39 40 U_NAMESPACE_BEGIN 41 42 namespace { 43 44 const char16_t BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before" 45 const int32_t BEFORE_LENGTH = 7; 46 47 } // namespace 48 49 CollationRuleParser::Sink::~Sink() {} 50 51 void 52 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {} 53 54 void 55 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {} 56 57 CollationRuleParser::Importer::~Importer() {} 58 59 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode) 60 : nfd(*Normalizer2::getNFDInstance(errorCode)), 61 nfc(*Normalizer2::getNFCInstance(errorCode)), 62 rules(nullptr), baseData(base), settings(nullptr), 63 parseError(nullptr), errorReason(nullptr), 64 sink(nullptr), importer(nullptr), 65 ruleIndex(0) { 66 } 67 68 CollationRuleParser::~CollationRuleParser() { 69 } 70 71 void 72 CollationRuleParser::parse(const UnicodeString &ruleString, 73 CollationSettings &outSettings, 74 UParseError *outParseError, 75 UErrorCode &errorCode) { 76 if(U_FAILURE(errorCode)) { return; } 77 settings = &outSettings; 78 parseError = outParseError; 79 if(parseError != nullptr) { 80 parseError->line = 0; 81 parseError->offset = -1; 82 parseError->preContext[0] = 0; 83 parseError->postContext[0] = 0; 84 } 85 errorReason = nullptr; 86 parse(ruleString, errorCode); 87 } 88 89 void 90 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) { 91 if(U_FAILURE(errorCode)) { return; } 92 rules = &ruleString; 93 ruleIndex = 0; 94 95 while(ruleIndex < rules->length()) { 96 char16_t c = rules->charAt(ruleIndex); 97 if(PatternProps::isWhiteSpace(c)) { 98 ++ruleIndex; 99 continue; 100 } 101 switch(c) { 102 case 0x26: // '&' 103 parseRuleChain(errorCode); 104 break; 105 case 0x5b: // '[' 106 parseSetting(errorCode); 107 break; 108 case 0x23: // '#' starts a comment, until the end of the line 109 ruleIndex = skipComment(ruleIndex + 1); 110 break; 111 case 0x40: // '@' is equivalent to [backwards 2] 112 settings->setFlag(CollationSettings::BACKWARD_SECONDARY, 113 UCOL_ON, 0, errorCode); 114 ++ruleIndex; 115 break; 116 case 0x21: // '!' used to turn on Thai/Lao character reversal 117 // Accept but ignore. The root collator has contractions 118 // that are equivalent to the character reversal, where appropriate. 119 ++ruleIndex; 120 break; 121 default: 122 setParseError("expected a reset or setting or comment", errorCode); 123 break; 124 } 125 if(U_FAILURE(errorCode)) { return; } 126 } 127 } 128 129 void 130 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) { 131 int32_t resetStrength = parseResetAndPosition(errorCode); 132 UBool isFirstRelation = true; 133 for(;;) { 134 int32_t result = parseRelationOperator(errorCode); 135 if(U_FAILURE(errorCode)) { return; } 136 if(result < 0) { 137 if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) { 138 // '#' starts a comment, until the end of the line 139 ruleIndex = skipComment(ruleIndex + 1); 140 continue; 141 } 142 if(isFirstRelation) { 143 setParseError("reset not followed by a relation", errorCode); 144 } 145 return; 146 } 147 int32_t strength = result & STRENGTH_MASK; 148 if(resetStrength < UCOL_IDENTICAL) { 149 // reset-before rule chain 150 if(isFirstRelation) { 151 if(strength != resetStrength) { 152 setParseError("reset-before strength differs from its first relation", errorCode); 153 return; 154 } 155 } else { 156 if(strength < resetStrength) { 157 setParseError("reset-before strength followed by a stronger relation", errorCode); 158 return; 159 } 160 } 161 } 162 int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator 163 if((result & STARRED_FLAG) == 0) { 164 parseRelationStrings(strength, i, errorCode); 165 } else { 166 parseStarredCharacters(strength, i, errorCode); 167 } 168 if(U_FAILURE(errorCode)) { return; } 169 isFirstRelation = false; 170 } 171 } 172 173 int32_t 174 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) { 175 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; } 176 int32_t i = skipWhiteSpace(ruleIndex + 1); 177 int32_t j; 178 char16_t c; 179 int32_t resetStrength; 180 if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 && 181 (j = i + BEFORE_LENGTH) < rules->length() && 182 PatternProps::isWhiteSpace(rules->charAt(j)) && 183 ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() && 184 0x31 <= (c = rules->charAt(j)) && c <= 0x33 && 185 rules->charAt(j + 1) == 0x5d) { 186 // &[before n] with n=1 or 2 or 3 187 resetStrength = UCOL_PRIMARY + (c - 0x31); 188 i = skipWhiteSpace(j + 2); 189 } else { 190 resetStrength = UCOL_IDENTICAL; 191 } 192 if(i >= rules->length()) { 193 setParseError("reset without position", errorCode); 194 return UCOL_DEFAULT; 195 } 196 UnicodeString str; 197 if(rules->charAt(i) == 0x5b) { // '[' 198 i = parseSpecialPosition(i, str, errorCode); 199 } else { 200 i = parseTailoringString(i, str, errorCode); 201 } 202 sink->addReset(resetStrength, str, errorReason, errorCode); 203 if(U_FAILURE(errorCode)) { setErrorContext(); } 204 ruleIndex = i; 205 return resetStrength; 206 } 207 208 int32_t 209 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) { 210 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; } 211 ruleIndex = skipWhiteSpace(ruleIndex); 212 if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; } 213 int32_t strength; 214 int32_t i = ruleIndex; 215 char16_t c = rules->charAt(i++); 216 switch(c) { 217 case 0x3c: // '<' 218 if(i < rules->length() && rules->charAt(i) == 0x3c) { // << 219 ++i; 220 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<< 221 ++i; 222 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<< 223 ++i; 224 strength = UCOL_QUATERNARY; 225 } else { 226 strength = UCOL_TERTIARY; 227 } 228 } else { 229 strength = UCOL_SECONDARY; 230 } 231 } else { 232 strength = UCOL_PRIMARY; 233 } 234 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*' 235 ++i; 236 strength |= STARRED_FLAG; 237 } 238 break; 239 case 0x3b: // ';' same as << 240 strength = UCOL_SECONDARY; 241 break; 242 case 0x2c: // ',' same as <<< 243 strength = UCOL_TERTIARY; 244 break; 245 case 0x3d: // '=' 246 strength = UCOL_IDENTICAL; 247 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*' 248 ++i; 249 strength |= STARRED_FLAG; 250 } 251 break; 252 default: 253 return UCOL_DEFAULT; 254 } 255 return ((i - ruleIndex) << OFFSET_SHIFT) | strength; 256 } 257 258 void 259 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) { 260 // Parse 261 // prefix | str / extension 262 // where prefix and extension are optional. 263 UnicodeString prefix, str, extension; 264 i = parseTailoringString(i, str, errorCode); 265 if(U_FAILURE(errorCode)) { return; } 266 char16_t next = (i < rules->length()) ? rules->charAt(i) : 0; 267 if(next == 0x7c) { // '|' separates the context prefix from the string. 268 prefix = str; 269 i = parseTailoringString(i + 1, str, errorCode); 270 if(U_FAILURE(errorCode)) { return; } 271 next = (i < rules->length()) ? rules->charAt(i) : 0; 272 } 273 if(next == 0x2f) { // '/' separates the string from the extension. 274 i = parseTailoringString(i + 1, extension, errorCode); 275 } 276 if(!prefix.isEmpty()) { 277 UChar32 prefix0 = prefix.char32At(0); 278 UChar32 c = str.char32At(0); 279 if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) { 280 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary", 281 errorCode); 282 return; 283 } 284 } 285 sink->addRelation(strength, prefix, str, extension, errorReason, errorCode); 286 if(U_FAILURE(errorCode)) { setErrorContext(); } 287 ruleIndex = i; 288 } 289 290 void 291 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) { 292 UnicodeString empty, raw; 293 i = parseString(skipWhiteSpace(i), raw, errorCode); 294 if(U_FAILURE(errorCode)) { return; } 295 if(raw.isEmpty()) { 296 setParseError("missing starred-relation string", errorCode); 297 return; 298 } 299 UChar32 prev = -1; 300 int32_t j = 0; 301 for(;;) { 302 while(j < raw.length()) { 303 UChar32 c = raw.char32At(j); 304 if(!nfd.isInert(c)) { 305 setParseError("starred-relation string is not all NFD-inert", errorCode); 306 return; 307 } 308 sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode); 309 if(U_FAILURE(errorCode)) { 310 setErrorContext(); 311 return; 312 } 313 j += U16_LENGTH(c); 314 prev = c; 315 } 316 if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-' 317 break; 318 } 319 if(prev < 0) { 320 setParseError("range without start in starred-relation string", errorCode); 321 return; 322 } 323 i = parseString(i + 1, raw, errorCode); 324 if(U_FAILURE(errorCode)) { return; } 325 if(raw.isEmpty()) { 326 setParseError("range without end in starred-relation string", errorCode); 327 return; 328 } 329 UChar32 c = raw.char32At(0); 330 if(c < prev) { 331 setParseError("range start greater than end in starred-relation string", errorCode); 332 return; 333 } 334 // range prev-c 335 UnicodeString s; 336 while(++prev <= c) { 337 if(!nfd.isInert(prev)) { 338 setParseError("starred-relation string range is not all NFD-inert", errorCode); 339 return; 340 } 341 if(U_IS_SURROGATE(prev)) { 342 setParseError("starred-relation string range contains a surrogate", errorCode); 343 return; 344 } 345 if(0xfffd <= prev && prev <= 0xffff) { 346 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode); 347 return; 348 } 349 s.setTo(prev); 350 sink->addRelation(strength, empty, s, empty, errorReason, errorCode); 351 if(U_FAILURE(errorCode)) { 352 setErrorContext(); 353 return; 354 } 355 } 356 prev = -1; 357 j = U16_LENGTH(c); 358 } 359 ruleIndex = skipWhiteSpace(i); 360 } 361 362 int32_t 363 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) { 364 i = parseString(skipWhiteSpace(i), raw, errorCode); 365 if(U_SUCCESS(errorCode) && raw.isEmpty()) { 366 setParseError("missing relation string", errorCode); 367 } 368 return skipWhiteSpace(i); 369 } 370 371 int32_t 372 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) { 373 if(U_FAILURE(errorCode)) { return i; } 374 raw.remove(); 375 while(i < rules->length()) { 376 UChar32 c = rules->charAt(i++); 377 if(isSyntaxChar(c)) { 378 if(c == 0x27) { // apostrophe 379 if(i < rules->length() && rules->charAt(i) == 0x27) { 380 // Double apostrophe, encodes a single one. 381 raw.append(static_cast<char16_t>(0x27)); 382 ++i; 383 continue; 384 } 385 // Quote literal text until the next single apostrophe. 386 for(;;) { 387 if(i == rules->length()) { 388 setParseError("quoted literal text missing terminating apostrophe", errorCode); 389 return i; 390 } 391 c = rules->charAt(i++); 392 if(c == 0x27) { 393 if(i < rules->length() && rules->charAt(i) == 0x27) { 394 // Double apostrophe inside quoted literal text, 395 // still encodes a single apostrophe. 396 ++i; 397 } else { 398 break; 399 } 400 } 401 raw.append(static_cast<char16_t>(c)); 402 } 403 } else if(c == 0x5c) { // backslash 404 if(i == rules->length()) { 405 setParseError("backslash escape at the end of the rule string", errorCode); 406 return i; 407 } 408 c = rules->char32At(i); 409 raw.append(c); 410 i += U16_LENGTH(c); 411 } else { 412 // Any other syntax character terminates a string. 413 --i; 414 break; 415 } 416 } else if(PatternProps::isWhiteSpace(c)) { 417 // Unquoted white space terminates a string. 418 --i; 419 break; 420 } else { 421 raw.append(static_cast<char16_t>(c)); 422 } 423 } 424 for(int32_t j = 0; j < raw.length();) { 425 UChar32 c = raw.char32At(j); 426 if(U_IS_SURROGATE(c)) { 427 setParseError("string contains an unpaired surrogate", errorCode); 428 return i; 429 } 430 if(0xfffd <= c && c <= 0xffff) { 431 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode); 432 return i; 433 } 434 j += U16_LENGTH(c); 435 } 436 return i; 437 } 438 439 namespace { 440 441 const char* const positions[] = { 442 "first tertiary ignorable", 443 "last tertiary ignorable", 444 "first secondary ignorable", 445 "last secondary ignorable", 446 "first primary ignorable", 447 "last primary ignorable", 448 "first variable", 449 "last variable", 450 "first regular", 451 "last regular", 452 "first implicit", 453 "last implicit", 454 "first trailing", 455 "last trailing" 456 }; 457 458 } // namespace 459 460 int32_t 461 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) { 462 if(U_FAILURE(errorCode)) { return 0; } 463 UnicodeString raw; 464 int32_t j = readWords(i + 1, raw); 465 if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ] 466 ++j; 467 for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) { 468 if(raw == UnicodeString(positions[pos], -1, US_INV)) { 469 str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + pos)); 470 return j; 471 } 472 } 473 if(raw == UNICODE_STRING_SIMPLE("top")) { 474 str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + LAST_REGULAR)); 475 return j; 476 } 477 if(raw == UNICODE_STRING_SIMPLE("variable top")) { 478 str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + LAST_VARIABLE)); 479 return j; 480 } 481 } 482 setParseError("not a valid special reset position", errorCode); 483 return i; 484 } 485 486 void 487 CollationRuleParser::parseSetting(UErrorCode &errorCode) { 488 if(U_FAILURE(errorCode)) { return; } 489 UnicodeString raw; 490 int32_t i = ruleIndex + 1; 491 int32_t j = readWords(i, raw); 492 if(j <= i || raw.isEmpty()) { 493 setParseError("expected a setting/option at '['", errorCode); 494 } 495 if(rules->charAt(j) == 0x5d) { // words end with ] 496 ++j; 497 if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) && 498 (raw.length() == 7 || raw.charAt(7) == 0x20)) { 499 parseReordering(raw, errorCode); 500 ruleIndex = j; 501 return; 502 } 503 if(raw == UNICODE_STRING_SIMPLE("backwards 2")) { 504 settings->setFlag(CollationSettings::BACKWARD_SECONDARY, 505 UCOL_ON, 0, errorCode); 506 ruleIndex = j; 507 return; 508 } 509 UnicodeString v; 510 int32_t valueIndex = raw.lastIndexOf(static_cast<char16_t>(0x20)); 511 if(valueIndex >= 0) { 512 v.setTo(raw, valueIndex + 1); 513 raw.truncate(valueIndex); 514 } 515 if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) { 516 int32_t value = UCOL_DEFAULT; 517 char16_t c = v.charAt(0); 518 if(0x31 <= c && c <= 0x34) { // 1..4 519 value = UCOL_PRIMARY + (c - 0x31); 520 } else if(c == 0x49) { // 'I' 521 value = UCOL_IDENTICAL; 522 } 523 if(value != UCOL_DEFAULT) { 524 settings->setStrength(value, 0, errorCode); 525 ruleIndex = j; 526 return; 527 } 528 } else if(raw == UNICODE_STRING_SIMPLE("alternate")) { 529 UColAttributeValue value = UCOL_DEFAULT; 530 if(v == UNICODE_STRING_SIMPLE("non-ignorable")) { 531 value = UCOL_NON_IGNORABLE; 532 } else if(v == UNICODE_STRING_SIMPLE("shifted")) { 533 value = UCOL_SHIFTED; 534 } 535 if(value != UCOL_DEFAULT) { 536 settings->setAlternateHandling(value, 0, errorCode); 537 ruleIndex = j; 538 return; 539 } 540 } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) { 541 int32_t value = UCOL_DEFAULT; 542 if(v == UNICODE_STRING_SIMPLE("space")) { 543 value = CollationSettings::MAX_VAR_SPACE; 544 } else if(v == UNICODE_STRING_SIMPLE("punct")) { 545 value = CollationSettings::MAX_VAR_PUNCT; 546 } else if(v == UNICODE_STRING_SIMPLE("symbol")) { 547 value = CollationSettings::MAX_VAR_SYMBOL; 548 } else if(v == UNICODE_STRING_SIMPLE("currency")) { 549 value = CollationSettings::MAX_VAR_CURRENCY; 550 } 551 if(value != UCOL_DEFAULT) { 552 settings->setMaxVariable(value, 0, errorCode); 553 settings->variableTop = baseData->getLastPrimaryForGroup( 554 UCOL_REORDER_CODE_FIRST + value); 555 U_ASSERT(settings->variableTop != 0); 556 ruleIndex = j; 557 return; 558 } 559 } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) { 560 UColAttributeValue value = UCOL_DEFAULT; 561 if(v == UNICODE_STRING_SIMPLE("off")) { 562 value = UCOL_OFF; 563 } else if(v == UNICODE_STRING_SIMPLE("lower")) { 564 value = UCOL_LOWER_FIRST; 565 } else if(v == UNICODE_STRING_SIMPLE("upper")) { 566 value = UCOL_UPPER_FIRST; 567 } 568 if(value != UCOL_DEFAULT) { 569 settings->setCaseFirst(value, 0, errorCode); 570 ruleIndex = j; 571 return; 572 } 573 } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) { 574 UColAttributeValue value = getOnOffValue(v); 575 if(value != UCOL_DEFAULT) { 576 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode); 577 ruleIndex = j; 578 return; 579 } 580 } else if(raw == UNICODE_STRING_SIMPLE("normalization")) { 581 UColAttributeValue value = getOnOffValue(v); 582 if(value != UCOL_DEFAULT) { 583 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode); 584 ruleIndex = j; 585 return; 586 } 587 } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) { 588 UColAttributeValue value = getOnOffValue(v); 589 if(value != UCOL_DEFAULT) { 590 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode); 591 ruleIndex = j; 592 return; 593 } 594 } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) { 595 UColAttributeValue value = getOnOffValue(v); 596 if(value != UCOL_DEFAULT) { 597 if(value == UCOL_ON) { 598 setParseError("[hiraganaQ on] is not supported", errorCode); 599 } 600 ruleIndex = j; 601 return; 602 } 603 } else if(raw == UNICODE_STRING_SIMPLE("import")) { 604 CharString lang; 605 lang.appendInvariantChars(v, errorCode); 606 if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; } 607 // BCP 47 language tag -> ICU locale ID 608 int32_t parsedLength; 609 CharString localeID = ulocimp_forLanguageTag(lang.data(), -1, &parsedLength, errorCode); 610 if(U_FAILURE(errorCode) || parsedLength != lang.length()) { 611 errorCode = U_ZERO_ERROR; 612 setParseError("expected language tag in [import langTag]", errorCode); 613 return; 614 } 615 // localeID minus all keywords 616 CharString baseID = ulocimp_getBaseName(localeID.toStringPiece(), errorCode); 617 if (U_FAILURE(errorCode)) { 618 errorCode = U_ZERO_ERROR; 619 setParseError("expected language tag in [import langTag]", errorCode); 620 return; 621 } 622 if (baseID.isEmpty()) { 623 baseID.copyFrom("root", errorCode); 624 } else if (baseID[0] == '_') { 625 // CharString doesn't have any insert() method, only append(). 626 constexpr char und[] = "und"; 627 constexpr int32_t length = sizeof und - 1; 628 int32_t dummy; 629 char* tail = baseID.getAppendBuffer(length, length, dummy, errorCode); 630 char* head = baseID.data(); 631 uprv_memmove(head + length, head, baseID.length()); 632 uprv_memcpy(head, und, length); 633 baseID.append(tail, length, errorCode); 634 } 635 // @collation=type, or length=0 if not specified 636 CharString collationType = ulocimp_getKeywordValue(localeID.data(), "collation", errorCode); 637 if(U_FAILURE(errorCode)) { 638 errorCode = U_ZERO_ERROR; 639 setParseError("expected language tag in [import langTag]", errorCode); 640 return; 641 } 642 if(importer == nullptr) { 643 setParseError("[import langTag] is not supported", errorCode); 644 } else { 645 UnicodeString importedRules; 646 importer->getRules(baseID.data(), 647 !collationType.isEmpty() ? collationType.data() : "standard", 648 importedRules, errorReason, errorCode); 649 if(U_FAILURE(errorCode)) { 650 if(errorReason == nullptr) { 651 errorReason = "[import langTag] failed"; 652 } 653 setErrorContext(); 654 return; 655 } 656 const UnicodeString *outerRules = rules; 657 int32_t outerRuleIndex = ruleIndex; 658 parse(importedRules, errorCode); 659 if(U_FAILURE(errorCode)) { 660 if(parseError != nullptr) { 661 parseError->offset = outerRuleIndex; 662 } 663 } 664 rules = outerRules; 665 ruleIndex = j; 666 } 667 return; 668 } 669 } else if(rules->charAt(j) == 0x5b) { // words end with [ 670 UnicodeSet set; 671 j = parseUnicodeSet(j, set, errorCode); 672 if(U_FAILURE(errorCode)) { return; } 673 if(raw == UNICODE_STRING_SIMPLE("optimize")) { 674 sink->optimize(set, errorReason, errorCode); 675 if(U_FAILURE(errorCode)) { setErrorContext(); } 676 ruleIndex = j; 677 return; 678 } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) { 679 sink->suppressContractions(set, errorReason, errorCode); 680 if(U_FAILURE(errorCode)) { setErrorContext(); } 681 ruleIndex = j; 682 return; 683 } 684 } 685 setParseError("not a valid setting/option", errorCode); 686 } 687 688 void 689 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) { 690 if(U_FAILURE(errorCode)) { return; } 691 int32_t i = 7; // after "reorder" 692 if(i == raw.length()) { 693 // empty [reorder] with no codes 694 settings->resetReordering(); 695 return; 696 } 697 // Parse the codes in [reorder aa bb cc]. 698 UVector32 reorderCodes(errorCode); 699 if(U_FAILURE(errorCode)) { return; } 700 CharString word; 701 while(i < raw.length()) { 702 ++i; // skip the word-separating space 703 int32_t limit = raw.indexOf(static_cast<char16_t>(0x20), i); 704 if(limit < 0) { limit = raw.length(); } 705 word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode); 706 if(U_FAILURE(errorCode)) { return; } 707 int32_t code = getReorderCode(word.data()); 708 if(code < 0) { 709 setParseError("unknown script or reorder code", errorCode); 710 return; 711 } 712 reorderCodes.addElement(code, errorCode); 713 if(U_FAILURE(errorCode)) { return; } 714 i = limit; 715 } 716 settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode); 717 } 718 719 static const char *const gSpecialReorderCodes[] = { 720 "space", "punct", "symbol", "currency", "digit" 721 }; 722 723 int32_t 724 CollationRuleParser::getReorderCode(const char *word) { 725 for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) { 726 if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) { 727 return UCOL_REORDER_CODE_FIRST + i; 728 } 729 } 730 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word); 731 if(script >= 0) { 732 return script; 733 } 734 if(uprv_stricmp(word, "others") == 0) { 735 return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN 736 } 737 return -1; 738 } 739 740 UColAttributeValue 741 CollationRuleParser::getOnOffValue(const UnicodeString &s) { 742 if(s == UNICODE_STRING_SIMPLE("on")) { 743 return UCOL_ON; 744 } else if(s == UNICODE_STRING_SIMPLE("off")) { 745 return UCOL_OFF; 746 } else { 747 return UCOL_DEFAULT; 748 } 749 } 750 751 int32_t 752 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) { 753 // Collect a UnicodeSet pattern between a balanced pair of [brackets]. 754 int32_t level = 0; 755 int32_t j = i; 756 for(;;) { 757 if(j == rules->length()) { 758 setParseError("unbalanced UnicodeSet pattern brackets", errorCode); 759 return j; 760 } 761 char16_t c = rules->charAt(j++); 762 if(c == 0x5b) { // '[' 763 ++level; 764 } else if(c == 0x5d) { // ']' 765 if(--level == 0) { break; } 766 } 767 } 768 set.applyPattern(rules->tempSubStringBetween(i, j), errorCode); 769 if(U_FAILURE(errorCode)) { 770 errorCode = U_ZERO_ERROR; 771 setParseError("not a valid UnicodeSet pattern", errorCode); 772 return j; 773 } 774 j = skipWhiteSpace(j); 775 if(j == rules->length() || rules->charAt(j) != 0x5d) { 776 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode); 777 return j; 778 } 779 return ++j; 780 } 781 782 int32_t 783 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const { 784 static const char16_t sp = 0x20; 785 raw.remove(); 786 i = skipWhiteSpace(i); 787 for(;;) { 788 if(i >= rules->length()) { return 0; } 789 char16_t c = rules->charAt(i); 790 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_ 791 if(raw.isEmpty()) { return i; } 792 if(raw.endsWith(&sp, 1)) { // remove trailing space 793 raw.truncate(raw.length() - 1); 794 } 795 return i; 796 } 797 if(PatternProps::isWhiteSpace(c)) { 798 raw.append(sp); 799 i = skipWhiteSpace(i + 1); 800 } else { 801 raw.append(c); 802 ++i; 803 } 804 } 805 } 806 807 int32_t 808 CollationRuleParser::skipComment(int32_t i) const { 809 // skip to past the newline 810 while(i < rules->length()) { 811 char16_t c = rules->charAt(i++); 812 // LF or FF or CR or NEL or LS or PS 813 if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) { 814 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS." 815 // NLF (new line function) = CR or LF or CR+LF or NEL. 816 // No need to collect all of CR+LF because a following LF will be ignored anyway. 817 break; 818 } 819 } 820 return i; 821 } 822 823 void 824 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) { 825 if(U_FAILURE(errorCode)) { return; } 826 // Error code consistent with the old parser (from ca. 2001), 827 // rather than U_PARSE_ERROR; 828 errorCode = U_INVALID_FORMAT_ERROR; 829 errorReason = reason; 830 if(parseError != nullptr) { setErrorContext(); } 831 } 832 833 void 834 CollationRuleParser::setErrorContext() { 835 if(parseError == nullptr) { return; } 836 837 // Note: This relies on the calling code maintaining the ruleIndex 838 // at a position that is useful for debugging. 839 // For example, at the beginning of a reset or relation etc. 840 parseError->offset = ruleIndex; 841 parseError->line = 0; // We are not counting line numbers. 842 843 // before ruleIndex 844 int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1); 845 if(start < 0) { 846 start = 0; 847 } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) { 848 ++start; 849 } 850 int32_t length = ruleIndex - start; 851 rules->extract(start, length, parseError->preContext); 852 parseError->preContext[length] = 0; 853 854 // starting from ruleIndex 855 length = rules->length() - ruleIndex; 856 if(length >= U_PARSE_CONTEXT_LEN) { 857 length = U_PARSE_CONTEXT_LEN - 1; 858 if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) { 859 --length; 860 } 861 } 862 rules->extract(ruleIndex, length, parseError->postContext); 863 parseError->postContext[length] = 0; 864 } 865 866 UBool 867 CollationRuleParser::isSyntaxChar(UChar32 c) { 868 return 0x21 <= c && c <= 0x7e && 869 (c <= 0x2f || (0x3a <= c && c <= 0x40) || 870 (0x5b <= c && c <= 0x60) || (0x7b <= c)); 871 } 872 873 int32_t 874 CollationRuleParser::skipWhiteSpace(int32_t i) const { 875 while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) { 876 ++i; 877 } 878 return i; 879 } 880 881 U_NAMESPACE_END 882 883 #endif // !UCONFIG_NO_COLLATION