cpdtrans.cpp (21638B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 1999-2011, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 11/17/99 aliu Creation. 10 ********************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "unicode/unifilt.h" 18 #include "unicode/uniset.h" 19 #include "cpdtrans.h" 20 #include "uvector.h" 21 #include "tridpars.h" 22 #include "cmemory.h" 23 24 // keep in sync with Transliterator 25 //static const char16_t ID_SEP = 0x002D; /*-*/ 26 static const char16_t ID_DELIM = 0x003B; /*;*/ 27 static const char16_t NEWLINE = 10; 28 29 static const char16_t COLON_COLON[] = {0x3A, 0x3A, 0}; //"::" 30 31 U_NAMESPACE_BEGIN 32 33 const char16_t CompoundTransliterator::PASS_STRING[] = { 0x0025, 0x0050, 0x0061, 0x0073, 0x0073, 0 }; // "%Pass" 34 35 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompoundTransliterator) 36 37 /** 38 * Constructs a new compound transliterator given an array of 39 * transliterators. The array of transliterators may be of any 40 * length, including zero or one, however, useful compound 41 * transliterators have at least two components. 42 * @param transliterators array of <code>Transliterator</code> 43 * objects 44 * @param transliteratorCount The number of 45 * <code>Transliterator</code> objects in transliterators. 46 * @param filter the filter. Any character for which 47 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be 48 * altered by this transliterator. If <tt>filter</tt> is 49 * <tt>null</tt> then no filtering is applied. 50 */ 51 CompoundTransliterator::CompoundTransliterator( 52 Transliterator* const transliterators[], 53 int32_t transliteratorCount, 54 UnicodeFilter* adoptedFilter) : 55 Transliterator(joinIDs(transliterators, transliteratorCount), adoptedFilter), 56 trans(nullptr), count(0), numAnonymousRBTs(0) { 57 setTransliterators(transliterators, transliteratorCount); 58 } 59 60 /** 61 * Splits an ID of the form "ID;ID;..." into a compound using each 62 * of the IDs. 63 * @param id of above form 64 * @param forward if false, does the list in reverse order, and 65 * takes the inverse of each ID. 66 */ 67 CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, 68 UTransDirection direction, 69 UnicodeFilter* adoptedFilter, 70 UParseError& /*parseError*/, 71 UErrorCode& status) : 72 Transliterator(id, adoptedFilter), 73 trans(nullptr), numAnonymousRBTs(0) { 74 // TODO add code for parseError...currently unused, but 75 // later may be used by parsing code... 76 init(id, direction, true, status); 77 } 78 79 CompoundTransliterator::CompoundTransliterator(const UnicodeString& id, 80 UParseError& /*parseError*/, 81 UErrorCode& status) : 82 Transliterator(id, nullptr), // set filter to 0 here! 83 trans(nullptr), numAnonymousRBTs(0) { 84 // TODO add code for parseError...currently unused, but 85 // later may be used by parsing code... 86 init(id, UTRANS_FORWARD, true, status); 87 } 88 89 90 /** 91 * Private constructor for use of TransliteratorAlias 92 */ 93 CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID, 94 UVector& list, 95 UnicodeFilter* adoptedFilter, 96 int32_t anonymousRBTs, 97 UParseError& /*parseError*/, 98 UErrorCode& status) : 99 Transliterator(newID, adoptedFilter), 100 trans(nullptr), numAnonymousRBTs(anonymousRBTs) 101 { 102 init(list, UTRANS_FORWARD, false, status); 103 } 104 105 /** 106 * Private constructor for Transliterator from a vector of 107 * transliterators. The caller is responsible for fixing up the 108 * ID. 109 */ 110 CompoundTransliterator::CompoundTransliterator(UVector& list, 111 UParseError& /*parseError*/, 112 UErrorCode& status) : 113 Transliterator(UnicodeString(), nullptr), 114 trans(nullptr), numAnonymousRBTs(0) 115 { 116 // TODO add code for parseError...currently unused, but 117 // later may be used by parsing code... 118 init(list, UTRANS_FORWARD, false, status); 119 // assume caller will fixup ID 120 } 121 122 CompoundTransliterator::CompoundTransliterator(UVector& list, 123 int32_t anonymousRBTs, 124 UParseError& /*parseError*/, 125 UErrorCode& status) : 126 Transliterator(UnicodeString(), nullptr), 127 trans(nullptr), numAnonymousRBTs(anonymousRBTs) 128 { 129 init(list, UTRANS_FORWARD, false, status); 130 } 131 132 /** 133 * Finish constructing a transliterator: only to be called by 134 * constructors. Before calling init(), set trans and filter to nullptr. 135 * @param id the id containing ';'-separated entries 136 * @param direction either FORWARD or REVERSE 137 * @param idSplitPoint the index into id at which the 138 * adoptedSplitTransliterator should be inserted, if there is one, or 139 * -1 if there is none. 140 * @param adoptedSplitTransliterator a transliterator to be inserted 141 * before the entry at offset idSplitPoint in the id string. May be 142 * nullptr to insert no entry. 143 * @param fixReverseID if true, then reconstruct the ID of reverse 144 * entries by calling getID() of component entries. Some constructors 145 * do not require this because they apply a facade ID anyway. 146 * @param status the error code indicating success or failure 147 */ 148 void CompoundTransliterator::init(const UnicodeString& id, 149 UTransDirection direction, 150 UBool fixReverseID, 151 UErrorCode& status) { 152 // assert(trans == 0); 153 154 if (U_FAILURE(status)) { 155 return; 156 } 157 158 UVector list(status); 159 UnicodeSet* compoundFilter = nullptr; 160 UnicodeString regenID; 161 if (!TransliteratorIDParser::parseCompoundID(id, direction, 162 regenID, list, compoundFilter)) { 163 status = U_INVALID_ID; 164 delete compoundFilter; 165 return; 166 } 167 168 TransliteratorIDParser::instantiateList(list, status); 169 170 init(list, direction, fixReverseID, status); 171 172 if (compoundFilter != nullptr) { 173 adoptFilter(compoundFilter); 174 } 175 } 176 177 /** 178 * Finish constructing a transliterator: only to be called by 179 * constructors. Before calling init(), set trans and filter to nullptr. 180 * @param list a vector of transliterator objects to be adopted. It 181 * should NOT be empty. The list should be in declared order. That 182 * is, it should be in the FORWARD order; if direction is REVERSE then 183 * the list order will be reversed. 184 * @param direction either FORWARD or REVERSE 185 * @param fixReverseID if true, then reconstruct the ID of reverse 186 * entries by calling getID() of component entries. Some constructors 187 * do not require this because they apply a facade ID anyway. 188 * @param status the error code indicating success or failure 189 */ 190 void CompoundTransliterator::init(UVector& list, 191 UTransDirection direction, 192 UBool fixReverseID, 193 UErrorCode& status) { 194 // assert(trans == 0); 195 196 // Allocate array 197 if (U_SUCCESS(status)) { 198 count = list.size(); 199 trans = static_cast<Transliterator**>(uprv_malloc(count * sizeof(Transliterator*))); 200 /* test for nullptr */ 201 if (trans == nullptr) { 202 status = U_MEMORY_ALLOCATION_ERROR; 203 return; 204 } 205 } 206 207 if (U_FAILURE(status) || trans == nullptr) { 208 // assert(trans == 0); 209 return; 210 } 211 212 // Move the transliterators from the vector into an array. 213 // Reverse the order if necessary. 214 int32_t i; 215 for (i=0; i<count; ++i) { 216 int32_t j = (direction == UTRANS_FORWARD) ? i : count - 1 - i; 217 trans[i] = static_cast<Transliterator*>(list.elementAt(j)); 218 } 219 220 // If the direction is UTRANS_REVERSE then we may need to fix the 221 // ID. 222 if (direction == UTRANS_REVERSE && fixReverseID) { 223 UnicodeString newID; 224 for (i=0; i<count; ++i) { 225 if (i > 0) { 226 newID.append(ID_DELIM); 227 } 228 newID.append(trans[i]->getID()); 229 } 230 setID(newID); 231 } 232 233 computeMaximumContextLength(); 234 } 235 236 /** 237 * Return the IDs of the given list of transliterators, concatenated 238 * with ID_DELIM delimiting them. Equivalent to the perlish expression 239 * join(ID_DELIM, map($_.getID(), transliterators). 240 */ 241 UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterators[], 242 int32_t transCount) { 243 UnicodeString id; 244 for (int32_t i=0; i<transCount; ++i) { 245 if (i > 0) { 246 id.append(ID_DELIM); 247 } 248 id.append(transliterators[i]->getID()); 249 } 250 return id; // Return temporary 251 } 252 253 /** 254 * Copy constructor. 255 */ 256 CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) : 257 Transliterator(t), trans(nullptr), count(0), numAnonymousRBTs(-1) { 258 *this = t; 259 } 260 261 /** 262 * Destructor 263 */ 264 CompoundTransliterator::~CompoundTransliterator() { 265 freeTransliterators(); 266 } 267 268 void CompoundTransliterator::freeTransliterators() { 269 if (trans != nullptr) { 270 for (int32_t i=0; i<count; ++i) { 271 delete trans[i]; 272 } 273 uprv_free(trans); 274 } 275 trans = nullptr; 276 count = 0; 277 } 278 279 /** 280 * Assignment operator. 281 */ 282 CompoundTransliterator& CompoundTransliterator::operator=( 283 const CompoundTransliterator& t) 284 { 285 if (this == &t) { return *this; } // self-assignment: no-op 286 Transliterator::operator=(t); 287 int32_t i = 0; 288 UBool failed = false; 289 if (trans != nullptr) { 290 for (i=0; i<count; ++i) { 291 delete trans[i]; 292 trans[i] = nullptr; 293 } 294 } 295 if (t.count > count) { 296 if (trans != nullptr) { 297 uprv_free(trans); 298 } 299 trans = static_cast<Transliterator**>(uprv_malloc(t.count * sizeof(Transliterator*))); 300 } 301 count = t.count; 302 if (trans != nullptr) { 303 for (i=0; i<count; ++i) { 304 trans[i] = t.trans[i]->clone(); 305 if (trans[i] == nullptr) { 306 failed = true; 307 break; 308 } 309 } 310 } 311 312 // if memory allocation failed delete backwards trans array 313 if (failed && i > 0) { 314 int32_t n; 315 for (n = i-1; n >= 0; n--) { 316 uprv_free(trans[n]); 317 trans[n] = nullptr; 318 } 319 } 320 numAnonymousRBTs = t.numAnonymousRBTs; 321 return *this; 322 } 323 324 /** 325 * Transliterator API. 326 */ 327 CompoundTransliterator* CompoundTransliterator::clone() const { 328 return new CompoundTransliterator(*this); 329 } 330 331 /** 332 * Returns the number of transliterators in this chain. 333 * @return number of transliterators in this chain. 334 */ 335 int32_t CompoundTransliterator::getCount() const { 336 return count; 337 } 338 339 /** 340 * Returns the transliterator at the given index in this chain. 341 * @param index index into chain, from 0 to <code>getCount() - 1</code> 342 * @return transliterator at the given index 343 */ 344 const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const { 345 return *trans[index]; 346 } 347 348 void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[], 349 int32_t transCount) { 350 Transliterator** a = static_cast<Transliterator**>(uprv_malloc(transCount * sizeof(Transliterator*))); 351 if (a == nullptr) { 352 return; 353 } 354 int32_t i = 0; 355 UBool failed = false; 356 for (i=0; i<transCount; ++i) { 357 a[i] = transliterators[i]->clone(); 358 if (a[i] == nullptr) { 359 failed = true; 360 break; 361 } 362 } 363 if (failed && i > 0) { 364 int32_t n; 365 for (n = i-1; n >= 0; n--) { 366 uprv_free(a[n]); 367 a[n] = nullptr; 368 } 369 return; 370 } 371 adoptTransliterators(a, transCount); 372 } 373 374 void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[], 375 int32_t transCount) { 376 // First free trans[] and set count to zero. Once this is done, 377 // orphan the filter. Set up the new trans[]. 378 freeTransliterators(); 379 trans = adoptedTransliterators; 380 count = transCount; 381 computeMaximumContextLength(); 382 setID(joinIDs(trans, count)); 383 } 384 385 /** 386 * Append c to buf, unless buf is empty or buf already ends in c. 387 */ 388 static void _smartAppend(UnicodeString& buf, char16_t c) { 389 if (buf.length() != 0 && 390 buf.charAt(buf.length() - 1) != c) { 391 buf.append(c); 392 } 393 } 394 395 UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource, 396 UBool escapeUnprintable) const { 397 // We do NOT call toRules() on our component transliterators, in 398 // general. If we have several rule-based transliterators, this 399 // yields a concatenation of the rules -- not what we want. We do 400 // handle compound RBT transliterators specially -- those for which 401 // compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex, 402 // we do call toRules() recursively. 403 rulesSource.truncate(0); 404 if (numAnonymousRBTs >= 1 && getFilter() != nullptr) { 405 // If we are a compound RBT and if we have a global 406 // filter, then emit it at the top. 407 UnicodeString pat; 408 rulesSource.append(COLON_COLON, 2).append(getFilter()->toPattern(pat, escapeUnprintable)).append(ID_DELIM); 409 } 410 for (int32_t i=0; i<count; ++i) { 411 UnicodeString rule; 412 413 // Anonymous RuleBasedTransliterators (inline rules and 414 // ::BEGIN/::END blocks) are given IDs that begin with 415 // "%Pass": use toRules() to write all the rules to the output 416 // (and insert "::Null;" if we have two in a row) 417 if (trans[i]->getID().startsWith(PASS_STRING, 5)) { 418 trans[i]->toRules(rule, escapeUnprintable); 419 if (numAnonymousRBTs > 1 && i > 0 && trans[i - 1]->getID().startsWith(PASS_STRING, 5)) 420 rule = UNICODE_STRING_SIMPLE("::Null;") + rule; 421 422 // we also use toRules() on CompoundTransliterators (which we 423 // check for by looking for a semicolon in the ID)-- this gets 424 // the list of their child transliterators output in the right 425 // format 426 } else if (trans[i]->getID().indexOf(ID_DELIM) >= 0) { 427 trans[i]->toRules(rule, escapeUnprintable); 428 429 // for everything else, use Transliterator::toRules() 430 } else { 431 trans[i]->Transliterator::toRules(rule, escapeUnprintable); 432 } 433 _smartAppend(rulesSource, NEWLINE); 434 rulesSource.append(rule); 435 _smartAppend(rulesSource, ID_DELIM); 436 } 437 return rulesSource; 438 } 439 440 /** 441 * Implement Transliterator framework 442 */ 443 void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const { 444 UnicodeSet set; 445 result.clear(); 446 for (int32_t i=0; i<count; ++i) { 447 result.addAll(trans[i]->getSourceSet(set)); 448 // Take the example of Hiragana-Latin. This is really 449 // Hiragana-Katakana; Katakana-Latin. The source set of 450 // these two is roughly [:Hiragana:] and [:Katakana:]. 451 // But the source set for the entire transliterator is 452 // actually [:Hiragana:] ONLY -- that is, the first 453 // non-empty source set. 454 455 // This is a heuristic, and not 100% reliable. 456 if (!result.isEmpty()) { 457 break; 458 } 459 } 460 } 461 462 /** 463 * Override Transliterator framework 464 */ 465 UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const { 466 UnicodeSet set; 467 result.clear(); 468 for (int32_t i=0; i<count; ++i) { 469 // This is a heuristic, and not 100% reliable. 470 result.addAll(trans[i]->getTargetSet(set)); 471 } 472 return result; 473 } 474 475 /** 476 * Implements {@link Transliterator#handleTransliterate}. 477 */ 478 void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index, 479 UBool incremental) const { 480 /* Call each transliterator with the same contextStart and 481 * start, but with the limit as modified 482 * by preceding transliterators. The start index must be 483 * reset for each transliterator to give each a chance to 484 * transliterate the text. The initial contextStart index is known 485 * to still point to the same place after each transliterator 486 * is called because each transliterator will not change the 487 * text between contextStart and the initial start index. 488 * 489 * IMPORTANT: After the first transliterator, each subsequent 490 * transliterator only gets to transliterate text committed by 491 * preceding transliterators; that is, the start (output 492 * value) of transliterator i becomes the limit (input value) 493 * of transliterator i+1. Finally, the overall limit is fixed 494 * up before we return. 495 * 496 * Assumptions we make here: 497 * (1) contextStart <= start <= limit <= contextLimit <= text.length() 498 * (2) start <= start' <= limit' ;cursor doesn't move back 499 * (3) start <= limit' ;text before cursor unchanged 500 * - start' is the value of start after calling handleKT 501 * - limit' is the value of limit after calling handleKT 502 */ 503 504 /** 505 * Example: 3 transliterators. This example illustrates the 506 * mechanics we need to implement. C, S, and L are the contextStart, 507 * start, and limit. gl is the globalLimit. contextLimit is 508 * equal to limit throughout. 509 * 510 * 1. h-u, changes hex to Unicode 511 * 512 * 4 7 a d 0 4 7 a 513 * abc/u0061/u => abca/u 514 * C S L C S L gl=f->a 515 * 516 * 2. upup, changes "x" to "XX" 517 * 518 * 4 7 a 4 7 a 519 * abca/u => abcAA/u 520 * C SL C S 521 * L gl=a->b 522 * 3. u-h, changes Unicode to hex 523 * 524 * 4 7 a 4 7 a d 0 3 525 * abcAA/u => abc/u0041/u0041/u 526 * C S L C S 527 * L gl=b->15 528 * 4. return 529 * 530 * 4 7 a d 0 3 531 * abc/u0041/u0041/u 532 * C S L 533 */ 534 535 if (count < 1) { 536 index.start = index.limit; 537 return; // Short circuit for empty compound transliterators 538 } 539 540 // compoundLimit is the limit value for the entire compound 541 // operation. We overwrite index.limit with the previous 542 // index.start. After each transliteration, we update 543 // compoundLimit for insertions or deletions that have happened. 544 int32_t compoundLimit = index.limit; 545 546 // compoundStart is the start for the entire compound 547 // operation. 548 int32_t compoundStart = index.start; 549 550 int32_t delta = 0; // delta in length 551 552 // Give each transliterator a crack at the run of characters. 553 // See comments at the top of the method for more detail. 554 for (int32_t i=0; i<count; ++i) { 555 index.start = compoundStart; // Reset start 556 int32_t limit = index.limit; 557 558 if (index.start == index.limit) { 559 // Short circuit for empty range 560 break; 561 } 562 563 trans[i]->filteredTransliterate(text, index, incremental); 564 565 // In a properly written transliterator, start == limit after 566 // handleTransliterate() returns when incremental is false. 567 // Catch cases where the subclass doesn't do this, and throw 568 // an exception. (Just pinning start to limit is a bad idea, 569 // because what's probably happening is that the subclass 570 // isn't transliterating all the way to the end, and it should 571 // in non-incremental mode.) 572 if (!incremental && index.start != index.limit) { 573 // We can't throw an exception, so just fudge things 574 index.start = index.limit; 575 } 576 577 // Cumulative delta for insertions/deletions 578 delta += index.limit - limit; 579 580 if (incremental) { 581 // In the incremental case, only allow subsequent 582 // transliterators to modify what has already been 583 // completely processed by prior transliterators. In the 584 // non-incrmental case, allow each transliterator to 585 // process the entire text. 586 index.limit = index.start; 587 } 588 } 589 590 compoundLimit += delta; 591 592 // Start is good where it is -- where the last transliterator left 593 // it. Limit needs to be put back where it was, modulo 594 // adjustments for deletions/insertions. 595 index.limit = compoundLimit; 596 } 597 598 /** 599 * Sets the length of the longest context required by this transliterator. 600 * This is <em>preceding</em> context. 601 */ 602 void CompoundTransliterator::computeMaximumContextLength() { 603 int32_t max = 0; 604 for (int32_t i=0; i<count; ++i) { 605 int32_t len = trans[i]->getMaximumContextLength(); 606 if (len > max) { 607 max = len; 608 } 609 } 610 setMaximumContextLength(max); 611 } 612 613 U_NAMESPACE_END 614 615 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 616 617 /* eof */