tridpars.cpp (30445B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (c) 2002-2014, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 01/14/2002 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "tridpars.h" 18 #include "hash.h" 19 #include "mutex.h" 20 #include "transreg.h" 21 #include "uassert.h" 22 #include "ucln_in.h" 23 #include "unicode/parsepos.h" 24 #include "unicode/translit.h" 25 #include "unicode/uchar.h" 26 #include "unicode/uniset.h" 27 #include "unicode/unistr.h" 28 #include "unicode/utrans.h" 29 #include "util.h" 30 #include "uvector.h" 31 32 U_NAMESPACE_BEGIN 33 34 static const char16_t ID_DELIM = 0x003B; // ; 35 static const char16_t TARGET_SEP = 0x002D; // - 36 static const char16_t VARIANT_SEP = 0x002F; // / 37 static const char16_t OPEN_REV = 0x0028; // ( 38 static const char16_t CLOSE_REV = 0x0029; // ) 39 40 //static const char16_t EMPTY[] = {0}; // "" 41 static const char16_t ANY[] = {65,110,121,0}; // "Any" 42 static const char16_t ANY_NULL[] = {65,110,121,45,78,117,108,108,0}; // "Any-Null" 43 44 static const int32_t FORWARD = UTRANS_FORWARD; 45 static const int32_t REVERSE = UTRANS_REVERSE; 46 47 static Hashtable* SPECIAL_INVERSES = nullptr; 48 static UInitOnce gSpecialInversesInitOnce {}; 49 50 /** 51 * The mutex controlling access to SPECIAL_INVERSES 52 */ 53 static UMutex LOCK; 54 55 TransliteratorIDParser::Specs::Specs(const UnicodeString& s, const UnicodeString& t, 56 const UnicodeString& v, UBool sawS, 57 const UnicodeString& f) { 58 source = s; 59 target = t; 60 variant = v; 61 sawSource = sawS; 62 filter = f; 63 } 64 65 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b, 66 const UnicodeString& f) { 67 canonID = c; 68 basicID = b; 69 filter = f; 70 } 71 72 TransliteratorIDParser::SingleID::SingleID(const UnicodeString& c, const UnicodeString& b) { 73 canonID = c; 74 basicID = b; 75 } 76 77 Transliterator* TransliteratorIDParser::SingleID::createInstance() { 78 Transliterator* t; 79 if (basicID.length() == 0) { 80 t = createBasicInstance(UnicodeString(true, ANY_NULL, 8), &canonID); 81 } else { 82 t = createBasicInstance(basicID, &canonID); 83 } 84 if (t != nullptr) { 85 if (filter.length() != 0) { 86 UErrorCode ec = U_ZERO_ERROR; 87 UnicodeSet *set = new UnicodeSet(filter, ec); 88 if (U_FAILURE(ec)) { 89 delete set; 90 } else { 91 t->adoptFilter(set); 92 } 93 } 94 } 95 return t; 96 } 97 98 99 /** 100 * Parse a single ID, that is, an ID of the general form 101 * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element 102 * optional, the filters optional, and the variants optional. 103 * @param id the id to be parsed 104 * @param pos INPUT-OUTPUT parameter. On input, the position of 105 * the first character to parse. On output, the position after 106 * the last character parsed. 107 * @param dir the direction. If the direction is REVERSE then the 108 * SingleID is constructed for the reverse direction. 109 * @return a SingleID object or nullptr 110 */ 111 TransliteratorIDParser::SingleID* 112 TransliteratorIDParser::parseSingleID(const UnicodeString& id, int32_t& pos, 113 int32_t dir, UErrorCode& status) { 114 115 int32_t start = pos; 116 117 // The ID will be of the form A, A(), A(B), or (B), where 118 // A and B are filter IDs. 119 Specs* specsA = nullptr; 120 Specs* specsB = nullptr; 121 UBool sawParen = false; 122 123 // On the first pass, look for (B) or (). If this fails, then 124 // on the second pass, look for A, A(B), or A(). 125 for (int32_t pass=1; pass<=2; ++pass) { 126 if (pass == 2) { 127 specsA = parseFilterID(id, pos, true); 128 if (specsA == nullptr) { 129 pos = start; 130 return nullptr; 131 } 132 } 133 if (ICU_Utility::parseChar(id, pos, OPEN_REV)) { 134 sawParen = true; 135 if (!ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 136 specsB = parseFilterID(id, pos, true); 137 // Must close with a ')' 138 if (specsB == nullptr || !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 139 delete specsA; 140 pos = start; 141 return nullptr; 142 } 143 } 144 break; 145 } 146 } 147 148 // Assemble return results 149 SingleID* single; 150 if (sawParen) { 151 if (dir == FORWARD) { 152 SingleID* b = specsToID(specsB, FORWARD); 153 single = specsToID(specsA, FORWARD); 154 // Null pointers check 155 if (b == nullptr || single == nullptr) { 156 delete b; 157 delete single; 158 status = U_MEMORY_ALLOCATION_ERROR; 159 return nullptr; 160 } 161 single->canonID.append(OPEN_REV) 162 .append(b->canonID).append(CLOSE_REV); 163 if (specsA != nullptr) { 164 single->filter = specsA->filter; 165 } 166 delete b; 167 } else { 168 SingleID* a = specsToID(specsA, FORWARD); 169 single = specsToID(specsB, FORWARD); 170 // Check for null pointer. 171 if (a == nullptr || single == nullptr) { 172 delete a; 173 delete single; 174 status = U_MEMORY_ALLOCATION_ERROR; 175 return nullptr; 176 } 177 single->canonID.append(OPEN_REV) 178 .append(a->canonID).append(CLOSE_REV); 179 if (specsB != nullptr) { 180 single->filter = specsB->filter; 181 } 182 delete a; 183 } 184 } else { 185 // assert(specsA != nullptr); 186 if (dir == FORWARD) { 187 single = specsToID(specsA, FORWARD); 188 } else { 189 single = specsToSpecialInverse(*specsA, status); 190 if (single == nullptr) { 191 single = specsToID(specsA, REVERSE); 192 } 193 } 194 // Check for nullptr pointer 195 if (single == nullptr) { 196 status = U_MEMORY_ALLOCATION_ERROR; 197 return nullptr; 198 } 199 single->filter = specsA->filter; 200 } 201 202 delete specsA; 203 delete specsB; 204 205 return single; 206 } 207 208 /** 209 * Parse a filter ID, that is, an ID of the general form 210 * "[f1] s1-t1/v1", with the filters optional, and the variants optional. 211 * @param id the id to be parsed 212 * @param pos INPUT-OUTPUT parameter. On input, the position of 213 * the first character to parse. On output, the position after 214 * the last character parsed. 215 * @return a SingleID object or null if the parse fails 216 */ 217 TransliteratorIDParser::SingleID* 218 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos) { 219 220 int32_t start = pos; 221 222 Specs* specs = parseFilterID(id, pos, true); 223 if (specs == nullptr) { 224 pos = start; 225 return nullptr; 226 } 227 228 // Assemble return results 229 SingleID* single = specsToID(specs, FORWARD); 230 if (single != nullptr) { 231 single->filter = specs->filter; 232 } 233 delete specs; 234 return single; 235 } 236 237 /** 238 * Parse a global filter of the form "[f]" or "([f])", depending 239 * on 'withParens'. 240 * @param id the pattern the parse 241 * @param pos INPUT-OUTPUT parameter. On input, the position of 242 * the first character to parse. On output, the position after 243 * the last character parsed. 244 * @param dir the direction. 245 * @param withParens INPUT-OUTPUT parameter. On entry, if 246 * withParens is 0, then parens are disallowed. If it is 1, 247 * then parens are requires. If it is -1, then parens are 248 * optional, and the return result will be set to 0 or 1. 249 * @param canonID OUTPUT parameter. The pattern for the filter 250 * added to the canonID, either at the end, if dir is FORWARD, or 251 * at the start, if dir is REVERSE. The pattern will be enclosed 252 * in parentheses if appropriate, and will be suffixed with an 253 * ID_DELIM character. May be nullptr. 254 * @return a UnicodeSet object or nullptr. A non-nullptr results 255 * indicates a successful parse, regardless of whether the filter 256 * applies to the given direction. The caller should discard it 257 * if withParens != (dir == REVERSE). 258 */ 259 UnicodeSet* TransliteratorIDParser::parseGlobalFilter(const UnicodeString& id, int32_t& pos, 260 int32_t dir, 261 int32_t& withParens, 262 UnicodeString* canonID) { 263 UnicodeSet* filter = nullptr; 264 int32_t start = pos; 265 266 if (withParens == -1) { 267 withParens = ICU_Utility::parseChar(id, pos, OPEN_REV) ? 1 : 0; 268 } else if (withParens == 1) { 269 if (!ICU_Utility::parseChar(id, pos, OPEN_REV)) { 270 pos = start; 271 return nullptr; 272 } 273 } 274 275 ICU_Utility::skipWhitespace(id, pos, true); 276 277 if (UnicodeSet::resemblesPattern(id, pos)) { 278 ParsePosition ppos(pos); 279 UErrorCode ec = U_ZERO_ERROR; 280 filter = new UnicodeSet(id, ppos, USET_IGNORE_SPACE, nullptr, ec); 281 /* test for nullptr */ 282 if (filter == nullptr) { 283 pos = start; 284 return nullptr; 285 } 286 if (U_FAILURE(ec)) { 287 delete filter; 288 pos = start; 289 return nullptr; 290 } 291 292 UnicodeString pattern; 293 id.extractBetween(pos, ppos.getIndex(), pattern); 294 pos = ppos.getIndex(); 295 296 if (withParens == 1 && !ICU_Utility::parseChar(id, pos, CLOSE_REV)) { 297 delete filter; 298 pos = start; 299 return nullptr; 300 } 301 302 // In the forward direction, append the pattern to the 303 // canonID. In the reverse, insert it at zero, and invert 304 // the presence of parens ("A" <-> "(A)"). 305 if (canonID != nullptr) { 306 if (dir == FORWARD) { 307 if (withParens == 1) { 308 pattern.insert(0, OPEN_REV); 309 pattern.append(CLOSE_REV); 310 } 311 canonID->append(pattern).append(ID_DELIM); 312 } else { 313 if (withParens == 0) { 314 pattern.insert(0, OPEN_REV); 315 pattern.append(CLOSE_REV); 316 } 317 canonID->insert(0, pattern); 318 canonID->insert(pattern.length(), ID_DELIM); 319 } 320 } 321 } 322 323 return filter; 324 } 325 326 U_CDECL_BEGIN 327 static void U_CALLCONV _deleteSingleID(void* obj) { 328 delete (TransliteratorIDParser::SingleID*) obj; 329 } 330 331 static void U_CALLCONV _deleteTransliteratorTrIDPars(void* obj) { 332 delete (Transliterator*) obj; 333 } 334 U_CDECL_END 335 336 /** 337 * Parse a compound ID, consisting of an optional forward global 338 * filter, a separator, one or more single IDs delimited by 339 * separators, an an optional reverse global filter. The 340 * separator is a semicolon. The global filters are UnicodeSet 341 * patterns. The reverse global filter must be enclosed in 342 * parentheses. 343 * @param id the pattern the parse 344 * @param dir the direction. 345 * @param canonID OUTPUT parameter that receives the canonical ID, 346 * consisting of canonical IDs for all elements, as returned by 347 * parseSingleID(), separated by semicolons. Previous contents 348 * are discarded. 349 * @param list OUTPUT parameter that receives a list of SingleID 350 * objects representing the parsed IDs. Previous contents are 351 * discarded. 352 * @param globalFilter OUTPUT parameter that receives a pointer to 353 * a newly created global filter for this ID in this direction, or 354 * nullptr if there is none. 355 * @return true if the parse succeeds, that is, if the entire 356 * id is consumed without syntax error. 357 */ 358 UBool TransliteratorIDParser::parseCompoundID(const UnicodeString& id, int32_t dir, 359 UnicodeString& canonID, 360 UVector& list, 361 UnicodeSet*& globalFilter) { 362 UErrorCode ec = U_ZERO_ERROR; 363 int32_t i; 364 int32_t pos = 0; 365 int32_t withParens = 1; 366 list.removeAllElements(); 367 UObjectDeleter *save = list.setDeleter(_deleteSingleID); 368 369 UnicodeSet* filter; 370 globalFilter = nullptr; 371 canonID.truncate(0); 372 373 // Parse leading global filter, if any 374 withParens = 0; // parens disallowed 375 filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); 376 if (filter != nullptr) { 377 if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { 378 // Not a global filter; backup and resume 379 canonID.truncate(0); 380 pos = 0; 381 } 382 if (dir == FORWARD) { 383 globalFilter = filter; 384 } else { 385 delete filter; 386 } 387 filter = nullptr; 388 } 389 390 UBool sawDelimiter = true; 391 for (;;) { 392 SingleID* single = parseSingleID(id, pos, dir, ec); 393 if (single == nullptr) { 394 break; 395 } 396 if (dir == FORWARD) { 397 list.adoptElement(single, ec); 398 } else { 399 list.insertElementAt(single, 0, ec); 400 } 401 if (U_FAILURE(ec)) { 402 goto FAIL; 403 } 404 if (!ICU_Utility::parseChar(id, pos, ID_DELIM)) { 405 sawDelimiter = false; 406 break; 407 } 408 } 409 410 if (list.size() == 0) { 411 goto FAIL; 412 } 413 414 // Construct canonical ID 415 for (i=0; i<list.size(); ++i) { 416 SingleID* single = static_cast<SingleID*>(list.elementAt(i)); 417 canonID.append(single->canonID); 418 if (i != (list.size()-1)) { 419 canonID.append(ID_DELIM); 420 } 421 } 422 423 // Parse trailing global filter, if any, and only if we saw 424 // a trailing delimiter after the IDs. 425 if (sawDelimiter) { 426 withParens = 1; // parens required 427 filter = parseGlobalFilter(id, pos, dir, withParens, &canonID); 428 if (filter != nullptr) { 429 // Don't require trailing ';', but parse it if present 430 ICU_Utility::parseChar(id, pos, ID_DELIM); 431 432 if (dir == REVERSE) { 433 globalFilter = filter; 434 } else { 435 delete filter; 436 } 437 filter = nullptr; 438 } 439 } 440 441 // Trailing unparsed text is a syntax error 442 ICU_Utility::skipWhitespace(id, pos, true); 443 if (pos != id.length()) { 444 goto FAIL; 445 } 446 447 list.setDeleter(save); 448 return true; 449 450 FAIL: 451 list.removeAllElements(); 452 list.setDeleter(save); 453 delete globalFilter; 454 globalFilter = nullptr; 455 return false; 456 } 457 458 /** 459 * Convert the elements of the 'list' vector, which are SingleID 460 * objects, into actual Transliterator objects. In the course of 461 * this, some (or all) entries may be removed. If all entries 462 * are removed, the nullptr transliterator will be added. 463 * 464 * Delete entries with empty basicIDs; these are generated by 465 * elements like "(A)" in the forward direction, or "A()" in 466 * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert 467 * SingleID entries to actual transliterators. 468 * 469 * @param list vector of SingleID objects. On exit, vector 470 * of one or more Transliterators. 471 * @return new value of insertIndex. The index will shift if 472 * there are empty items, like "(Lower)", with indices less than 473 * insertIndex. 474 */ 475 void TransliteratorIDParser::instantiateList(UVector& list, 476 UErrorCode& ec) { 477 UVector tlist(ec); 478 if (U_FAILURE(ec)) { 479 goto RETURN; 480 } 481 tlist.setDeleter(_deleteTransliteratorTrIDPars); 482 483 Transliterator* t; 484 int32_t i; 485 for (i=0; i<=list.size(); ++i) { // [sic]: i<=list.size() 486 // We run the loop too long by one, so we can 487 // do an insert after the last element 488 if (i==list.size()) { 489 break; 490 } 491 492 SingleID* single = static_cast<SingleID*>(list.elementAt(i)); 493 if (single->basicID.length() != 0) { 494 t = single->createInstance(); 495 if (t == nullptr) { 496 ec = U_INVALID_ID; 497 goto RETURN; 498 } 499 tlist.adoptElement(t, ec); 500 if (U_FAILURE(ec)) { 501 goto RETURN; 502 } 503 } 504 } 505 506 // An empty list is equivalent to a nullptr transliterator. 507 if (tlist.size() == 0) { 508 t = createBasicInstance(UnicodeString(true, ANY_NULL, 8), nullptr); 509 if (t == nullptr) { 510 // Should never happen 511 ec = U_INTERNAL_TRANSLITERATOR_ERROR; 512 } 513 tlist.adoptElement(t, ec); 514 } 515 516 RETURN: 517 518 UObjectDeleter *save = list.setDeleter(_deleteSingleID); 519 list.removeAllElements(); 520 521 if (U_SUCCESS(ec)) { 522 list.setDeleter(_deleteTransliteratorTrIDPars); 523 524 while (tlist.size() > 0) { 525 t = static_cast<Transliterator*>(tlist.orphanElementAt(0)); 526 list.adoptElement(t, ec); 527 if (U_FAILURE(ec)) { 528 list.removeAllElements(); 529 break; 530 } 531 } 532 } 533 534 list.setDeleter(save); 535 } 536 537 /** 538 * Parse an ID into pieces. Take IDs of the form T, T/V, S-T, 539 * S-T/V, or S/V-T. If the source is missing, return a source of 540 * ANY. 541 * @param id the id string, in any of several forms 542 * @return an array of 4 strings: source, target, variant, and 543 * isSourcePresent. If the source is not present, ANY will be 544 * given as the source, and isSourcePresent will be nullptr. Otherwise 545 * isSourcePresent will be non-nullptr. The target may be empty if the 546 * id is not well-formed. The variant may be empty. 547 */ 548 void TransliteratorIDParser::IDtoSTV(const UnicodeString& id, 549 UnicodeString& source, 550 UnicodeString& target, 551 UnicodeString& variant, 552 UBool& isSourcePresent) { 553 source.setTo(ANY, 3); 554 target.truncate(0); 555 variant.truncate(0); 556 557 int32_t sep = id.indexOf(TARGET_SEP); 558 int32_t var = id.indexOf(VARIANT_SEP); 559 if (var < 0) { 560 var = id.length(); 561 } 562 isSourcePresent = false; 563 564 if (sep < 0) { 565 // Form: T/V or T (or /V) 566 id.extractBetween(0, var, target); 567 id.extractBetween(var, id.length(), variant); 568 } else if (sep < var) { 569 // Form: S-T/V or S-T (or -T/V or -T) 570 if (sep > 0) { 571 id.extractBetween(0, sep, source); 572 isSourcePresent = true; 573 } 574 id.extractBetween(++sep, var, target); 575 id.extractBetween(var, id.length(), variant); 576 } else { 577 // Form: (S/V-T or /V-T) 578 if (var > 0) { 579 id.extractBetween(0, var, source); 580 isSourcePresent = true; 581 } 582 id.extractBetween(var, sep++, variant); 583 id.extractBetween(sep, id.length(), target); 584 } 585 586 if (variant.length() > 0) { 587 variant.remove(0, 1); 588 } 589 } 590 591 /** 592 * Given source, target, and variant strings, concatenate them into a 593 * full ID. If the source is empty, then "Any" will be used for the 594 * source, so the ID will always be of the form s-t/v or s-t. 595 */ 596 void TransliteratorIDParser::STVtoID(const UnicodeString& source, 597 const UnicodeString& target, 598 const UnicodeString& variant, 599 UnicodeString& id) { 600 id = source; 601 if (id.length() == 0) { 602 id.setTo(ANY, 3); 603 } 604 id.append(TARGET_SEP).append(target); 605 if (variant.length() != 0) { 606 id.append(VARIANT_SEP).append(variant); 607 } 608 // NUL-terminate the ID string for getTerminatedBuffer. 609 // This prevents valgrind and Purify warnings. 610 id.append(static_cast<char16_t>(0)); 611 id.truncate(id.length()-1); 612 } 613 614 /** 615 * Register two targets as being inverses of one another. For 616 * example, calling registerSpecialInverse("NFC", "NFD", true) causes 617 * Transliterator to form the following inverse relationships: 618 * 619 * <pre>NFC => NFD 620 * Any-NFC => Any-NFD 621 * NFD => NFC 622 * Any-NFD => Any-NFC</pre> 623 * 624 * (Without the special inverse registration, the inverse of NFC 625 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but 626 * that the presence or absence of "Any-" is preserved. 627 * 628 * <p>The relationship is symmetrical; registering (a, b) is 629 * equivalent to registering (b, a). 630 * 631 * <p>The relevant IDs must still be registered separately as 632 * factories or classes. 633 * 634 * <p>Only the targets are specified. Special inverses always 635 * have the form Any-Target1 <=> Any-Target2. The target should 636 * have canonical casing (the casing desired to be produced when 637 * an inverse is formed) and should contain no whitespace or other 638 * extraneous characters. 639 * 640 * @param target the target against which to register the inverse 641 * @param inverseTarget the inverse of target, that is 642 * Any-target.getInverse() => Any-inverseTarget 643 * @param bidirectional if true, register the reverse relation 644 * as well, that is, Any-inverseTarget.getInverse() => Any-target 645 */ 646 void TransliteratorIDParser::registerSpecialInverse(const UnicodeString& target, 647 const UnicodeString& inverseTarget, 648 UBool bidirectional, 649 UErrorCode &status) { 650 umtx_initOnce(gSpecialInversesInitOnce, init, status); 651 if (U_FAILURE(status)) { 652 return; 653 } 654 655 // If target == inverseTarget then force bidirectional => false 656 if (bidirectional && 0==target.caseCompare(inverseTarget, U_FOLD_CASE_DEFAULT)) { 657 bidirectional = false; 658 } 659 660 Mutex lock(&LOCK); 661 662 UnicodeString *tempus = new UnicodeString(inverseTarget); // Used for null pointer check before usage. 663 if (tempus == nullptr) { 664 status = U_MEMORY_ALLOCATION_ERROR; 665 return; 666 } 667 SPECIAL_INVERSES->put(target, tempus, status); 668 if (bidirectional) { 669 tempus = new UnicodeString(target); 670 if (tempus == nullptr) { 671 status = U_MEMORY_ALLOCATION_ERROR; 672 return; 673 } 674 SPECIAL_INVERSES->put(inverseTarget, tempus, status); 675 } 676 } 677 678 //---------------------------------------------------------------- 679 // Private implementation 680 //---------------------------------------------------------------- 681 682 /** 683 * Parse an ID into component pieces. Take IDs of the form T, 684 * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a 685 * source of ANY. 686 * @param id the id string, in any of several forms 687 * @param pos INPUT-OUTPUT parameter. On input, pos is the 688 * offset of the first character to parse in id. On output, 689 * pos is the offset after the last parsed character. If the 690 * parse failed, pos will be unchanged. 691 * @param allowFilter2 if true, a UnicodeSet pattern is allowed 692 * at any location between specs or delimiters, and is returned 693 * as the fifth string in the array. 694 * @return a Specs object, or nullptr if the parse failed. If 695 * neither source nor target was seen in the parsed id, then the 696 * parse fails. If allowFilter is true, then the parsed filter 697 * pattern is returned in the Specs object, otherwise the returned 698 * filter reference is nullptr. If the parse fails for any reason 699 * nullptr is returned. 700 */ 701 TransliteratorIDParser::Specs* 702 TransliteratorIDParser::parseFilterID(const UnicodeString& id, int32_t& pos, 703 UBool allowFilter) { 704 UnicodeString first; 705 UnicodeString source; 706 UnicodeString target; 707 UnicodeString variant; 708 UnicodeString filter; 709 char16_t delimiter = 0; 710 int32_t specCount = 0; 711 int32_t start = pos; 712 713 // This loop parses one of the following things with each 714 // pass: a filter, a delimiter character (either '-' or '/'), 715 // or a spec (source, target, or variant). 716 for (;;) { 717 ICU_Utility::skipWhitespace(id, pos, true); 718 if (pos == id.length()) { 719 break; 720 } 721 722 // Parse filters 723 if (allowFilter && filter.length() == 0 && 724 UnicodeSet::resemblesPattern(id, pos)) { 725 726 ParsePosition ppos(pos); 727 UErrorCode ec = U_ZERO_ERROR; 728 UnicodeSet set(id, ppos, USET_IGNORE_SPACE, nullptr, ec); 729 if (U_FAILURE(ec)) { 730 pos = start; 731 return nullptr; 732 } 733 id.extractBetween(pos, ppos.getIndex(), filter); 734 pos = ppos.getIndex(); 735 continue; 736 } 737 738 if (delimiter == 0) { 739 char16_t c = id.charAt(pos); 740 if ((c == TARGET_SEP && target.length() == 0) || 741 (c == VARIANT_SEP && variant.length() == 0)) { 742 delimiter = c; 743 ++pos; 744 continue; 745 } 746 } 747 748 // We are about to try to parse a spec with no delimiter 749 // when we can no longer do so (we can only do so at the 750 // start); break. 751 if (delimiter == 0 && specCount > 0) { 752 break; 753 } 754 755 UnicodeString spec = ICU_Utility::parseUnicodeIdentifier(id, pos); 756 if (spec.length() == 0) { 757 // Note that if there was a trailing delimiter, we 758 // consume it. So Foo-, Foo/, Foo-Bar/, and Foo/Bar- 759 // are legal. 760 break; 761 } 762 763 switch (delimiter) { 764 case 0: 765 first = spec; 766 break; 767 case TARGET_SEP: 768 target = spec; 769 break; 770 case VARIANT_SEP: 771 variant = spec; 772 break; 773 } 774 ++specCount; 775 delimiter = 0; 776 } 777 778 // A spec with no prior character is either source or target, 779 // depending on whether an explicit "-target" was seen. 780 if (first.length() != 0) { 781 if (target.length() == 0) { 782 target = first; 783 } else { 784 source = first; 785 } 786 } 787 788 // Must have either source or target 789 if (source.length() == 0 && target.length() == 0) { 790 pos = start; 791 return nullptr; 792 } 793 794 // Empty source or target defaults to ANY 795 UBool sawSource = true; 796 if (source.length() == 0) { 797 source.setTo(ANY, 3); 798 sawSource = false; 799 } 800 if (target.length() == 0) { 801 target.setTo(ANY, 3); 802 } 803 804 return new Specs(source, target, variant, sawSource, filter); 805 } 806 807 /** 808 * Givens a Spec object, convert it to a SingleID object. The 809 * Spec object is a more unprocessed parse result. The SingleID 810 * object contains information about canonical and basic IDs. 811 * @return a SingleID; never returns nullptr. Returned object always 812 * has 'filter' field of nullptr. 813 */ 814 TransliteratorIDParser::SingleID* 815 TransliteratorIDParser::specsToID(const Specs* specs, int32_t dir) { 816 UnicodeString canonID; 817 UnicodeString basicID; 818 UnicodeString basicPrefix; 819 if (specs != nullptr) { 820 UnicodeString buf; 821 if (dir == FORWARD) { 822 if (specs->sawSource) { 823 buf.append(specs->source).append(TARGET_SEP); 824 } else { 825 basicPrefix = specs->source; 826 basicPrefix.append(TARGET_SEP); 827 } 828 buf.append(specs->target); 829 } else { 830 buf.append(specs->target).append(TARGET_SEP).append(specs->source); 831 } 832 if (specs->variant.length() != 0) { 833 buf.append(VARIANT_SEP).append(specs->variant); 834 } 835 basicID = basicPrefix; 836 basicID.append(buf); 837 if (specs->filter.length() != 0) { 838 buf.insert(0, specs->filter); 839 } 840 canonID = buf; 841 } 842 return new SingleID(canonID, basicID); 843 } 844 845 /** 846 * Given a Specs object, return a SingleID representing the 847 * special inverse of that ID. If there is no special inverse 848 * then return nullptr. 849 * @return a SingleID or nullptr. Returned object always has 850 * 'filter' field of nullptr. 851 */ 852 TransliteratorIDParser::SingleID* 853 TransliteratorIDParser::specsToSpecialInverse(const Specs& specs, UErrorCode &status) { 854 if (0!=specs.source.caseCompare(ANY, 3, U_FOLD_CASE_DEFAULT)) { 855 return nullptr; 856 } 857 umtx_initOnce(gSpecialInversesInitOnce, init, status); 858 if (U_FAILURE(status)) { 859 return nullptr; 860 } 861 862 UnicodeString* inverseTarget; 863 864 umtx_lock(&LOCK); 865 inverseTarget = static_cast<UnicodeString*>(SPECIAL_INVERSES->get(specs.target)); 866 umtx_unlock(&LOCK); 867 868 if (inverseTarget != nullptr) { 869 // If the original ID contained "Any-" then make the 870 // special inverse "Any-Foo"; otherwise make it "Foo". 871 // So "Any-NFC" => "Any-NFD" but "NFC" => "NFD". 872 UnicodeString buf; 873 if (specs.filter.length() != 0) { 874 buf.append(specs.filter); 875 } 876 if (specs.sawSource) { 877 buf.append(ANY, 3).append(TARGET_SEP); 878 } 879 buf.append(*inverseTarget); 880 881 UnicodeString basicID(true, ANY, 3); 882 basicID.append(TARGET_SEP).append(*inverseTarget); 883 884 if (specs.variant.length() != 0) { 885 buf.append(VARIANT_SEP).append(specs.variant); 886 basicID.append(VARIANT_SEP).append(specs.variant); 887 } 888 return new SingleID(buf, basicID); 889 } 890 return nullptr; 891 } 892 893 /** 894 * Glue method to get around access problems in C++. This would 895 * ideally be inline but we want to avoid a circular header 896 * dependency. 897 */ 898 Transliterator* TransliteratorIDParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { 899 return Transliterator::createBasicInstance(id, canonID); 900 } 901 902 /** 903 * Initialize static memory. Called through umtx_initOnce only. 904 */ 905 void U_CALLCONV TransliteratorIDParser::init(UErrorCode &status) { 906 U_ASSERT(SPECIAL_INVERSES == nullptr); 907 ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup); 908 909 SPECIAL_INVERSES = new Hashtable(true, status); 910 if (SPECIAL_INVERSES == nullptr) { 911 status = U_MEMORY_ALLOCATION_ERROR; 912 return; 913 } 914 SPECIAL_INVERSES->setValueDeleter(uprv_deleteUObject); 915 } 916 917 /** 918 * Free static memory. 919 */ 920 void TransliteratorIDParser::cleanup() { 921 if (SPECIAL_INVERSES) { 922 delete SPECIAL_INVERSES; 923 SPECIAL_INVERSES = nullptr; 924 } 925 gSpecialInversesInitOnce.reset(); 926 } 927 928 U_NAMESPACE_END 929 930 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 931 932 //eof