rbt_pars.cpp (65389B)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 1999-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/parsepos.h"
#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/uniset.h"
#include "unicode/utf16.h"
#include "cstring.h"
#include "funcrepl.h"
#include "hash.h"
#include "quant.h"
#include "rbt.h"
#include "rbt_data.h"
#include "rbt_pars.h"
#include "rbt_rule.h"
#include "strmatch.h"
#include "strrepl.h"
#include "unicode/symtable.h"
#include "tridpars.h"
#include "uvector.h"
#include "hash.h"
#include "patternprops.h"
#include "util.h"
#include "cmemory.h"
#include "uprops.h"
#include "putilimp.h"

// Operators that separate the two halves of a rule or define a variable
#define VARIABLE_DEF_OP ((char16_t)0x003D) /*=*/
#define FORWARD_RULE_OP ((char16_t)0x003E) /*>*/
#define REVERSE_RULE_OP ((char16_t)0x003C) /*<*/
#define FWDREV_RULE_OP  ((char16_t)0x007E) /*~*/ // internal rep of <> op

// Other special characters
#define QUOTE             ((char16_t)0x0027) /*'*/
#define ESCAPE            ((char16_t)0x005C) /*\*/
#define END_OF_RULE       ((char16_t)0x003B) /*;*/
#define RULE_COMMENT_CHAR ((char16_t)0x0023) /*#*/

// Characters with special meaning inside a rule half
#define SEGMENT_OPEN  ((char16_t)0x0028) /*(*/
#define SEGMENT_CLOSE ((char16_t)0x0029) /*)*/
#define CONTEXT_ANTE  ((char16_t)0x007B) /*{*/
#define CONTEXT_POST  ((char16_t)0x007D) /*}*/
#define CURSOR_POS    ((char16_t)0x007C) /*|*/
#define CURSOR_OFFSET ((char16_t)0x0040) /*@*/
#define ANCHOR_START  ((char16_t)0x005E) /*^*/
#define KLEENE_STAR   ((char16_t)0x002A) /***/
#define ONE_OR_MORE   ((char16_t)0x002B) /*+*/
#define ZERO_OR_ONE   ((char16_t)0x003F) /*?*/

#define DOT ((char16_t)46) /*.*/

// NUL-terminated set pattern, spelled as code units.
static const char16_t DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]";
    91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90,
    108, 58, 93, 92, 114, 92, 110, 36, 93, 0
};

// A function is denoted &Source-Target/Variant(text)
#define FUNCTION ((char16_t)38) /*&*/

// Aliases for some of the syntax characters. These are provided so
// transliteration rules can be expressed in XML without clashing with
// XML syntax characters '<', '>', and '&'.
#define ALT_REVERSE_RULE_OP ((char16_t)0x2190) // Left Arrow
#define ALT_FORWARD_RULE_OP ((char16_t)0x2192) // Right Arrow
#define ALT_FWDREV_RULE_OP  ((char16_t)0x2194) // Left Right Arrow
#define ALT_FUNCTION        ((char16_t)0x2206) // Increment (~Greek Capital Delta)

// Special characters disallowed at the top level (NUL-terminated)
static const char16_t ILLEGAL_TOP[] = {41,0}; // ")"

// Special characters disallowed within a segment (NUL-terminated)
static const char16_t ILLEGAL_SEG[] = {123,125,124,64,0}; // "{}|@"

// Special characters disallowed within a function argument (NUL-terminated)
static const char16_t ILLEGAL_FUNC[] = {94,40,46,42,43,63,123,125,124,64,0}; // "^(.*+?{}|@"

// By definition, the ANCHOR_END special character is a
// trailing SymbolTable.SYMBOL_REF character.
// private static final char ANCHOR_END = '$';

// NUL-terminated list of operator characters: "=><" followed by the
// arrow aliases U+2192, U+2190 and U+2194. Note that the internal
// FWDREV_RULE_OP ('~') is not a member of this list.
static const char16_t gOPERATORS[] = {
    VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
    ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
    0
};

// NUL-terminated list of characters that end one half of a rule:
// the same members as gOPERATORS ("=><" and the three arrow aliases)
// plus END_OF_RULE (';').
static const char16_t HALF_ENDERS[] = {
    VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
    ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
    END_OF_RULE,
    0
};

// These are also used in Transliterator::toRules()
// ID_TOKEN is NOT NUL-terminated; always pair it with ID_TOKEN_LEN.
static const int32_t ID_TOKEN_LEN = 2;
static const char16_t ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'

/*
commented out until we do real ::BEGIN/::END functionality
static const int32_t BEGIN_TOKEN_LEN = 5;
static const char16_t BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'

static const int32_t END_TOKEN_LEN = 3;
static const char16_t END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
*/

U_NAMESPACE_BEGIN

//----------------------------------------------------------------------
// BEGIN ParseData
//----------------------------------------------------------------------

/**
 * This class implements the SymbolTable interface. It is used
 * during parsing to give UnicodeSet access to variables that
 * have been defined so far. Note that it uses variablesVector,
 * _not_ data.setVariables.
136 */ 137 class ParseData : public UMemory, public SymbolTable { 138 public: 139 const TransliterationRuleData* data; // alias 140 141 const UVector* variablesVector; // alias 142 143 const Hashtable* variableNames; // alias 144 145 ParseData(const TransliterationRuleData* data = nullptr, 146 const UVector* variablesVector = nullptr, 147 const Hashtable* variableNames = nullptr); 148 149 virtual ~ParseData(); 150 151 virtual const UnicodeString* lookup(const UnicodeString& s) const override; 152 153 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const override; 154 155 virtual UnicodeString parseReference(const UnicodeString& text, 156 ParsePosition& pos, int32_t limit) const override; 157 /** 158 * Return true if the given character is a matcher standin or a plain 159 * character (non standin). 160 */ 161 UBool isMatcher(UChar32 ch); 162 163 /** 164 * Return true if the given character is a replacer standin or a plain 165 * character (non standin). 166 */ 167 UBool isReplacer(UChar32 ch); 168 169 private: 170 ParseData(const ParseData &other); // forbid copying of this class 171 ParseData &operator=(const ParseData &other); // forbid copying of this class 172 }; 173 174 ParseData::ParseData(const TransliterationRuleData* d, 175 const UVector* sets, 176 const Hashtable* vNames) : 177 data(d), variablesVector(sets), variableNames(vNames) {} 178 179 ParseData::~ParseData() {} 180 181 /** 182 * Implement SymbolTable API. 183 */ 184 const UnicodeString* ParseData::lookup(const UnicodeString& name) const { 185 return static_cast<const UnicodeString*>(variableNames->get(name)); 186 } 187 188 /** 189 * Implement SymbolTable API. 190 */ 191 const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const { 192 // Note that we cannot use data.lookupSet() because the 193 // set array has not been constructed yet. 
194 const UnicodeFunctor* set = nullptr; 195 int32_t i = ch - data->variablesBase; 196 if (i >= 0 && i < variablesVector->size()) { 197 int32_t j = ch - data->variablesBase; 198 set = (j < variablesVector->size()) ? 199 static_cast<UnicodeFunctor*>(variablesVector->elementAt(j)) : nullptr; 200 } 201 return set; 202 } 203 204 /** 205 * Implement SymbolTable API. Parse out a symbol reference 206 * name. 207 */ 208 UnicodeString ParseData::parseReference(const UnicodeString& text, 209 ParsePosition& pos, int32_t limit) const { 210 int32_t start = pos.getIndex(); 211 int32_t i = start; 212 UnicodeString result; 213 while (i < limit) { 214 char16_t c = text.charAt(i); 215 if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) { 216 break; 217 } 218 ++i; 219 } 220 if (i == start) { // No valid name chars 221 return result; // Indicate failure with empty string 222 } 223 pos.setIndex(i); 224 text.extractBetween(start, i, result); 225 return result; 226 } 227 228 UBool ParseData::isMatcher(UChar32 ch) { 229 // Note that we cannot use data.lookup() because the 230 // set array has not been constructed yet. 231 int32_t i = ch - data->variablesBase; 232 if (i >= 0 && i < variablesVector->size()) { 233 UnicodeFunctor* f = static_cast<UnicodeFunctor*>(variablesVector->elementAt(i)); 234 return f != nullptr && f->toMatcher() != nullptr; 235 } 236 return true; 237 } 238 239 /** 240 * Return true if the given character is a replacer standin or a plain 241 * character (non standin). 242 */ 243 UBool ParseData::isReplacer(UChar32 ch) { 244 // Note that we cannot use data.lookup() because the 245 // set array has not been constructed yet. 
246 int i = ch - data->variablesBase; 247 if (i >= 0 && i < variablesVector->size()) { 248 UnicodeFunctor* f = static_cast<UnicodeFunctor*>(variablesVector->elementAt(i)); 249 return f != nullptr && f->toReplacer() != nullptr; 250 } 251 return true; 252 } 253 254 //---------------------------------------------------------------------- 255 // BEGIN RuleHalf 256 //---------------------------------------------------------------------- 257 258 /** 259 * A class representing one side of a rule. This class knows how to 260 * parse half of a rule. It is tightly coupled to the method 261 * RuleBasedTransliterator.Parser.parseRule(). 262 */ 263 class RuleHalf : public UMemory { 264 265 public: 266 267 UnicodeString text; 268 269 int32_t cursor; // position of cursor in text 270 int32_t ante; // position of ante context marker '{' in text 271 int32_t post; // position of post context marker '}' in text 272 273 // Record the offset to the cursor either to the left or to the 274 // right of the key. This is indicated by characters on the output 275 // side that allow the cursor to be positioned arbitrarily within 276 // the matching text. For example, abc{def} > | @@@ xyz; changes 277 // def to xyz and moves the cursor to before abc. Offset characters 278 // must be at the start or end, and they cannot move the cursor past 279 // the ante- or postcontext text. Placeholders are only valid in 280 // output text. The length of the ante and post context is 281 // determined at runtime, because of supplementals and quantifiers. 282 int32_t cursorOffset; // only nonzero on output side 283 284 // Position of first CURSOR_OFFSET on _right_. This will be -1 285 // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc. 286 int32_t cursorOffsetPos; 287 288 UBool anchorStart; 289 UBool anchorEnd; 290 291 /** 292 * The segment number from 1..n of the next '(' we see 293 * during parsing; 1-based. 
294 */ 295 int32_t nextSegmentNumber; 296 297 TransliteratorParser& parser; 298 299 //-------------------------------------------------- 300 // Methods 301 302 RuleHalf(TransliteratorParser& parser); 303 ~RuleHalf(); 304 305 int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); 306 307 int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, 308 UnicodeString& buf, 309 const UnicodeString& illegal, 310 UBool isSegment, 311 UErrorCode& status); 312 313 /** 314 * Remove context. 315 */ 316 void removeContext(); 317 318 /** 319 * Return true if this half looks like valid output, that is, does not 320 * contain quantifiers or other special input-only elements. 321 */ 322 UBool isValidOutput(TransliteratorParser& parser); 323 324 /** 325 * Return true if this half looks like valid input, that is, does not 326 * contain functions or other special output-only elements. 327 */ 328 UBool isValidInput(TransliteratorParser& parser); 329 330 int syntaxError(UErrorCode code, 331 const UnicodeString& rule, 332 int32_t start, 333 UErrorCode& status) { 334 return parser.syntaxError(code, rule, start, status); 335 } 336 337 private: 338 // Disallowed methods; no impl. 339 RuleHalf(const RuleHalf&); 340 RuleHalf& operator=(const RuleHalf&); 341 }; 342 343 RuleHalf::RuleHalf(TransliteratorParser& p) : 344 parser(p) 345 { 346 cursor = -1; 347 ante = -1; 348 post = -1; 349 cursorOffset = 0; 350 cursorOffsetPos = 0; 351 anchorStart = anchorEnd = false; 352 nextSegmentNumber = 1; 353 } 354 355 RuleHalf::~RuleHalf() { 356 } 357 358 /** 359 * Parse one side of a rule, stopping at either the limit, 360 * the END_OF_RULE character, or an operator. 
361 * @return the index after the terminating character, or 362 * if limit was reached, limit 363 */ 364 int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 365 int32_t start = pos; 366 text.truncate(0); 367 pos = parseSection(rule, pos, limit, text, UnicodeString(true, ILLEGAL_TOP, -1), false, status); 368 369 if (cursorOffset > 0 && cursor != cursorOffsetPos) { 370 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 371 } 372 373 return pos; 374 } 375 376 /** 377 * Parse a section of one side of a rule, stopping at either 378 * the limit, the END_OF_RULE character, an operator, or a 379 * segment close character. This method parses both a 380 * top-level rule half and a segment within such a rule half. 381 * It calls itself recursively to parse segments and nested 382 * segments. 383 * @param buf buffer into which to accumulate the rule pattern 384 * characters, either literal characters from the rule or 385 * standins for UnicodeMatcher objects including segments. 386 * @param illegal the set of special characters that is illegal during 387 * this parse. 388 * @param isSegment if true, then we've already seen a '(' and 389 * pos on entry points right after it. Accumulate everything 390 * up to the closing ')', put it in a segment matcher object, 391 * generate a standin for it, and add the standin to buf. As 392 * a side effect, update the segments vector with a reference 393 * to the segment matcher. This works recursively for nested 394 * segments. If isSegment is false, just accumulate 395 * characters into buf. 
396 * @return the index after the terminating character, or 397 * if limit was reached, limit 398 */ 399 int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit, 400 UnicodeString& buf, 401 const UnicodeString& illegal, 402 UBool isSegment, UErrorCode& status) { 403 int32_t start = pos; 404 ParsePosition pp; 405 UnicodeString scratch; 406 UBool done = false; 407 int32_t quoteStart = -1; // Most recent 'single quoted string' 408 int32_t quoteLimit = -1; 409 int32_t varStart = -1; // Most recent $variableReference 410 int32_t varLimit = -1; 411 int32_t bufStart = buf.length(); 412 413 while (pos < limit && !done) { 414 // Since all syntax characters are in the BMP, fetching 415 // 16-bit code units suffices here. 416 char16_t c = rule.charAt(pos++); 417 if (PatternProps::isWhiteSpace(c)) { 418 // Ignore whitespace. Note that this is not Unicode 419 // spaces, but Java spaces -- a subset, representing 420 // whitespace likely to be seen in code. 421 continue; 422 } 423 if (u_strchr(HALF_ENDERS, c) != nullptr) { 424 if (isSegment) { 425 // Unclosed segment 426 return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status); 427 } 428 break; 429 } 430 if (anchorEnd) { 431 // Text after a presumed end anchor is a syntax err 432 return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, status); 433 } 434 if (UnicodeSet::resemblesPattern(rule, pos-1)) { 435 pp.setIndex(pos-1); // Backup to opening '[' 436 buf.append(parser.parseSet(rule, pp, status)); 437 if (U_FAILURE(status)) { 438 return syntaxError(U_MALFORMED_SET, rule, start, status); 439 } 440 pos = pp.getIndex(); 441 continue; 442 } 443 // Handle escapes 444 if (c == ESCAPE) { 445 if (pos == limit) { 446 return syntaxError(U_TRAILING_BACKSLASH, rule, start, status); 447 } 448 UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\' 449 if (escaped == static_cast<UChar32>(-1)) { 450 return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, status); 451 } 452 if 
(!parser.checkVariableRange(escaped)) { 453 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 454 } 455 buf.append(escaped); 456 continue; 457 } 458 // Handle quoted matter 459 if (c == QUOTE) { 460 int32_t iq = rule.indexOf(QUOTE, pos); 461 if (iq == pos) { 462 buf.append(c); // Parse [''] outside quotes as ['] 463 ++pos; 464 } else { 465 /* This loop picks up a run of quoted text of the 466 * form 'aaaa' each time through. If this run 467 * hasn't really ended ('aaaa''bbbb') then it keeps 468 * looping, each time adding on a new run. When it 469 * reaches the final quote it breaks. 470 */ 471 quoteStart = buf.length(); 472 for (;;) { 473 if (iq < 0) { 474 return syntaxError(U_UNTERMINATED_QUOTE, rule, start, status); 475 } 476 scratch.truncate(0); 477 rule.extractBetween(pos, iq, scratch); 478 buf.append(scratch); 479 pos = iq+1; 480 if (pos < limit && rule.charAt(pos) == QUOTE) { 481 // Parse [''] inside quotes as ['] 482 iq = rule.indexOf(QUOTE, pos+1); 483 // Continue looping 484 } else { 485 break; 486 } 487 } 488 quoteLimit = buf.length(); 489 490 for (iq=quoteStart; iq<quoteLimit; ++iq) { 491 if (!parser.checkVariableRange(buf.charAt(iq))) { 492 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 493 } 494 } 495 } 496 continue; 497 } 498 499 if (!parser.checkVariableRange(c)) { 500 return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status); 501 } 502 503 if (illegal.indexOf(c) >= 0) { 504 syntaxError(U_ILLEGAL_CHARACTER, rule, start, status); 505 } 506 507 switch (c) { 508 509 //------------------------------------------------------ 510 // Elements allowed within and out of segments 511 //------------------------------------------------------ 512 case ANCHOR_START: 513 if (buf.length() == 0 && !anchorStart) { 514 anchorStart = true; 515 } else { 516 return syntaxError(U_MISPLACED_ANCHOR_START, 517 rule, start, status); 518 } 519 break; 520 case SEGMENT_OPEN: 521 { 522 // bufSegStart is the offset in buf to the first 
523 // character of the segment we are parsing. 524 int32_t bufSegStart = buf.length(); 525 526 // Record segment number now, since nextSegmentNumber 527 // will be incremented during the call to parseSection 528 // if there are nested segments. 529 int32_t segmentNumber = nextSegmentNumber++; // 1-based 530 531 // Parse the segment 532 pos = parseSection(rule, pos, limit, buf, UnicodeString(true, ILLEGAL_SEG, -1), true, status); 533 534 // After parsing a segment, the relevant characters are 535 // in buf, starting at offset bufSegStart. Extract them 536 // into a string matcher, and replace them with a 537 // standin for that matcher. 538 LocalPointer<StringMatcher> m(new StringMatcher(buf, bufSegStart, buf.length(), 539 segmentNumber, *parser.curData), status); 540 if (U_FAILURE(status)) { 541 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 542 } 543 544 // Record and associate object and segment number 545 parser.setSegmentObject(segmentNumber, m.orphan(), status); 546 buf.truncate(bufSegStart); 547 buf.append(parser.getSegmentStandin(segmentNumber, status)); 548 } 549 break; 550 case FUNCTION: 551 case ALT_FUNCTION: 552 { 553 int32_t iref = pos; 554 TransliteratorIDParser::SingleID* single = 555 TransliteratorIDParser::parseFilterID(rule, iref); 556 // The next character MUST be a segment open 557 if (single == nullptr || 558 !ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) { 559 return syntaxError(U_INVALID_FUNCTION, rule, start, status); 560 } 561 562 Transliterator *t = single->createInstance(); 563 delete single; 564 if (t == nullptr) { 565 return syntaxError(U_INVALID_FUNCTION, rule, start, status); 566 } 567 568 // bufSegStart is the offset in buf to the first 569 // character of the segment we are parsing. 
570 int32_t bufSegStart = buf.length(); 571 572 // Parse the segment 573 pos = parseSection(rule, iref, limit, buf, UnicodeString(true, ILLEGAL_FUNC, -1), true, status); 574 575 // After parsing a segment, the relevant characters are 576 // in buf, starting at offset bufSegStart. 577 UnicodeString output; 578 buf.extractBetween(bufSegStart, buf.length(), output); 579 LocalPointer<FunctionReplacer> r( 580 new FunctionReplacer(t, new StringReplacer(output, parser.curData)), status); 581 if (U_FAILURE(status)) { 582 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 583 } 584 585 // Replace the buffer contents with a stand-in 586 buf.truncate(bufSegStart); 587 buf.append(parser.generateStandInFor(r.orphan(), status)); 588 } 589 break; 590 case SymbolTable::SYMBOL_REF: 591 // Handle variable references and segment references "$1" .. "$9" 592 { 593 // A variable reference must be followed immediately 594 // by a Unicode identifier start and zero or more 595 // Unicode identifier part characters, or by a digit 596 // 1..9 if it is a segment reference. 597 if (pos == limit) { 598 // A variable ref character at the end acts as 599 // an anchor to the context limit, as in perl. 600 anchorEnd = true; 601 break; 602 } 603 // Parse "$1" "$2" .. "$9" .. (no upper limit) 604 c = rule.charAt(pos); 605 int32_t r = u_digit(c, 10); 606 if (r >= 1 && r <= 9) { 607 r = ICU_Utility::parseNumber(rule, pos, 10); 608 if (r < 0) { 609 return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, 610 rule, start, status); 611 } 612 buf.append(parser.getSegmentStandin(r, status)); 613 } else { 614 pp.setIndex(pos); 615 UnicodeString name = parser.parseData-> 616 parseReference(rule, pp, limit); 617 if (name.length() == 0) { 618 // This means the '$' was not followed by a 619 // valid name. Try to interpret it as an 620 // end anchor then. If this also doesn't work 621 // (if we see a following character) then signal 622 // an error. 
623 anchorEnd = true; 624 break; 625 } 626 pos = pp.getIndex(); 627 // If this is a variable definition statement, 628 // then the LHS variable will be undefined. In 629 // that case appendVariableDef() will append the 630 // special placeholder char variableLimit-1. 631 varStart = buf.length(); 632 parser.appendVariableDef(name, buf, status); 633 varLimit = buf.length(); 634 } 635 } 636 break; 637 case DOT: 638 buf.append(parser.getDotStandIn(status)); 639 break; 640 case KLEENE_STAR: 641 case ONE_OR_MORE: 642 case ZERO_OR_ONE: 643 // Quantifiers. We handle single characters, quoted strings, 644 // variable references, and segments. 645 // a+ matches aaa 646 // 'foo'+ matches foofoofoo 647 // $v+ matches xyxyxy if $v == xy 648 // (seg)+ matches segsegseg 649 { 650 if (isSegment && buf.length() == bufStart) { 651 // The */+ immediately follows '(' 652 return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, status); 653 } 654 655 int32_t qstart, qlimit; 656 // The */+ follows an isolated character or quote 657 // or variable reference 658 if (buf.length() == quoteLimit) { 659 // The */+ follows a 'quoted string' 660 qstart = quoteStart; 661 qlimit = quoteLimit; 662 } else if (buf.length() == varLimit) { 663 // The */+ follows a $variableReference 664 qstart = varStart; 665 qlimit = varLimit; 666 } else { 667 // The */+ follows a single character, possibly 668 // a segment standin 669 qstart = buf.length() - 1; 670 qlimit = qstart + 1; 671 } 672 673 LocalPointer<UnicodeFunctor> m( 674 new StringMatcher(buf, qstart, qlimit, 0, *parser.curData), status); 675 if (U_FAILURE(status)) { 676 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 677 } 678 int32_t min = 0; 679 int32_t max = Quantifier::MAX; 680 switch (c) { 681 case ONE_OR_MORE: 682 min = 1; 683 break; 684 case ZERO_OR_ONE: 685 min = 0; 686 max = 1; 687 break; 688 // case KLEENE_STAR: 689 // do nothing -- min, max already set 690 } 691 LocalPointer<UnicodeFunctor> m2(new Quantifier(m.getAlias(), 
min, max), status); 692 if (m2.isValid()) { 693 m.orphan(); 694 } 695 if (U_FAILURE(status)) { 696 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 697 } 698 m = std::move(m2); 699 buf.truncate(qstart); 700 buf.append(parser.generateStandInFor(m.orphan(), status)); 701 } 702 break; 703 704 //------------------------------------------------------ 705 // Elements allowed ONLY WITHIN segments 706 //------------------------------------------------------ 707 case SEGMENT_CLOSE: 708 // assert(isSegment); 709 // We're done parsing a segment. 710 done = true; 711 break; 712 713 //------------------------------------------------------ 714 // Elements allowed ONLY OUTSIDE segments 715 //------------------------------------------------------ 716 case CONTEXT_ANTE: 717 if (ante >= 0) { 718 return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status); 719 } 720 ante = buf.length(); 721 break; 722 case CONTEXT_POST: 723 if (post >= 0) { 724 return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status); 725 } 726 post = buf.length(); 727 break; 728 case CURSOR_POS: 729 if (cursor >= 0) { 730 return syntaxError(U_MULTIPLE_CURSORS, rule, start, status); 731 } 732 cursor = buf.length(); 733 break; 734 case CURSOR_OFFSET: 735 if (cursorOffset < 0) { 736 if (buf.length() > 0) { 737 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 738 } 739 --cursorOffset; 740 } else if (cursorOffset > 0) { 741 if (buf.length() != cursorOffsetPos || cursor >= 0) { 742 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 743 } 744 ++cursorOffset; 745 } else { 746 if (cursor == 0 && buf.length() == 0) { 747 cursorOffset = -1; 748 } else if (cursor < 0) { 749 cursorOffsetPos = buf.length(); 750 cursorOffset = 1; 751 } else { 752 return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status); 753 } 754 } 755 break; 756 757 758 //------------------------------------------------------ 759 // Non-special characters 760 
//------------------------------------------------------ 761 default: 762 // Disallow unquoted characters other than [0-9A-Za-z] 763 // in the printable ASCII range. These characters are 764 // reserved for possible future use. 765 if (c >= 0x0021 && c <= 0x007E && 766 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) || 767 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) || 768 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) { 769 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); 770 } 771 buf.append(c); 772 break; 773 } 774 } 775 776 return pos; 777 } 778 779 /** 780 * Remove context. 781 */ 782 void RuleHalf::removeContext() { 783 //text = text.substring(ante < 0 ? 0 : ante, 784 // post < 0 ? text.length() : post); 785 if (post >= 0) { 786 text.remove(post); 787 } 788 if (ante >= 0) { 789 text.removeBetween(0, ante); 790 } 791 ante = post = -1; 792 anchorStart = anchorEnd = false; 793 } 794 795 /** 796 * Return true if this half looks like valid output, that is, does not 797 * contain quantifiers or other special input-only elements. 798 */ 799 UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) { 800 for (int32_t i=0; i<text.length(); ) { 801 UChar32 c = text.char32At(i); 802 i += U16_LENGTH(c); 803 if (!transParser.parseData->isReplacer(c)) { 804 return false; 805 } 806 } 807 return true; 808 } 809 810 /** 811 * Return true if this half looks like valid input, that is, does not 812 * contain functions or other special output-only elements. 813 */ 814 UBool RuleHalf::isValidInput(TransliteratorParser& transParser) { 815 for (int32_t i=0; i<text.length(); ) { 816 UChar32 c = text.char32At(i); 817 i += U16_LENGTH(c); 818 if (!transParser.parseData->isMatcher(c)) { 819 return false; 820 } 821 } 822 return true; 823 } 824 825 //---------------------------------------------------------------------- 826 // PUBLIC API 827 //---------------------------------------------------------------------- 828 829 /** 830 * Constructor. 
831 */ 832 TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) : 833 dataVector(statusReturn), 834 idBlockVector(statusReturn), 835 variablesVector(statusReturn), 836 segmentObjects(statusReturn) 837 { 838 idBlockVector.setDeleter(uprv_deleteUObject); 839 curData = nullptr; 840 compoundFilter = nullptr; 841 parseData = nullptr; 842 variableNames.setValueDeleter(uprv_deleteUObject); 843 } 844 845 /** 846 * Destructor. 847 */ 848 TransliteratorParser::~TransliteratorParser() { 849 while (!dataVector.isEmpty()) 850 delete static_cast<TransliterationRuleData*>(dataVector.orphanElementAt(0)); 851 delete compoundFilter; 852 delete parseData; 853 while (!variablesVector.isEmpty()) 854 delete static_cast<UnicodeFunctor*>(variablesVector.orphanElementAt(0)); 855 } 856 857 void 858 TransliteratorParser::parse(const UnicodeString& rules, 859 UTransDirection transDirection, 860 UParseError& pe, 861 UErrorCode& ec) { 862 if (U_SUCCESS(ec)) { 863 parseRules(rules, transDirection, ec); 864 pe = parseError; 865 } 866 } 867 868 /** 869 * Return the compound filter parsed by parse(). Caller owns result. 870 */ 871 UnicodeSet* TransliteratorParser::orphanCompoundFilter() { 872 UnicodeSet* f = compoundFilter; 873 compoundFilter = nullptr; 874 return f; 875 } 876 877 //---------------------------------------------------------------------- 878 // Private implementation 879 //---------------------------------------------------------------------- 880 881 /** 882 * Parse the given string as a sequence of rules, separated by newline 883 * characters ('\n'), and cause this object to implement those rules. Any 884 * previous rules are discarded. Typically this method is called exactly 885 * once, during construction. 
886 * @exception IllegalArgumentException if there is a syntax error in the 887 * rules 888 */ 889 void TransliteratorParser::parseRules(const UnicodeString& rule, 890 UTransDirection theDirection, 891 UErrorCode& status) 892 { 893 // Clear error struct 894 uprv_memset(&parseError, 0, sizeof(parseError)); 895 parseError.line = parseError.offset = -1; 896 897 UBool parsingIDs = true; 898 int32_t ruleCount = 0; 899 900 while (!dataVector.isEmpty()) { 901 delete static_cast<TransliterationRuleData*>(dataVector.orphanElementAt(0)); 902 } 903 if (U_FAILURE(status)) { 904 return; 905 } 906 907 idBlockVector.removeAllElements(); 908 curData = nullptr; 909 direction = theDirection; 910 ruleCount = 0; 911 912 delete compoundFilter; 913 compoundFilter = nullptr; 914 915 while (!variablesVector.isEmpty()) { 916 delete static_cast<UnicodeFunctor*>(variablesVector.orphanElementAt(0)); 917 } 918 variableNames.removeAll(); 919 parseData = new ParseData(nullptr, &variablesVector, &variableNames); 920 if (parseData == nullptr) { 921 status = U_MEMORY_ALLOCATION_ERROR; 922 return; 923 } 924 925 dotStandIn = static_cast<char16_t>(-1); 926 927 LocalPointer<UnicodeString> tempstr; // used for memory allocation error checking 928 UnicodeString str; // scratch 929 UnicodeString idBlockResult; 930 int32_t pos = 0; 931 int32_t limit = rule.length(); 932 933 // The compound filter offset is an index into idBlockResult. 934 // If it is 0, then the compound filter occurred at the start, 935 // and it is the offset to the _start_ of the compound filter 936 // pattern. Otherwise it is the offset to the _limit_ of the 937 // compound filter pattern within idBlockResult. 938 compoundFilter = nullptr; 939 int32_t compoundFilterOffset = -1; 940 941 while (pos < limit && U_SUCCESS(status)) { 942 char16_t c = rule.charAt(pos++); 943 if (PatternProps::isWhiteSpace(c)) { 944 // Ignore leading whitespace. 
945 continue; 946 } 947 // Skip lines starting with the comment character 948 if (c == RULE_COMMENT_CHAR) { 949 pos = rule.indexOf(static_cast<char16_t>(0x000A) /*\n*/, pos) + 1; 950 if (pos == 0) { 951 break; // No "\n" found; rest of rule is a comment 952 } 953 continue; // Either fall out or restart with next line 954 } 955 956 // skip empty rules 957 if (c == END_OF_RULE) 958 continue; 959 960 // keep track of how many rules we've seen 961 ++ruleCount; 962 963 // We've found the start of a rule or ID. c is its first 964 // character, and pos points past c. 965 --pos; 966 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 967 // chars left. 968 if ((pos + ID_TOKEN_LEN + 1) <= limit && 969 rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) { 970 pos += ID_TOKEN_LEN; 971 c = rule.charAt(pos); 972 while (PatternProps::isWhiteSpace(c) && pos < limit) { 973 ++pos; 974 c = rule.charAt(pos); 975 } 976 977 int32_t p = pos; 978 979 if (!parsingIDs) { 980 if (curData != nullptr) { 981 U_ASSERT(!dataVector.hasDeleter()); 982 if (direction == UTRANS_FORWARD) 983 dataVector.addElement(curData, status); 984 else 985 dataVector.insertElementAt(curData, 0, status); 986 if (U_FAILURE(status)) { 987 delete curData; 988 } 989 curData = nullptr; 990 } 991 parsingIDs = true; 992 } 993 994 TransliteratorIDParser::SingleID* id = 995 TransliteratorIDParser::parseSingleID(rule, p, direction, status); 996 if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) { 997 // Successful ::ID parse. 998 999 if (direction == UTRANS_FORWARD) { 1000 idBlockResult.append(id->canonID).append(END_OF_RULE); 1001 } else { 1002 idBlockResult.insert(0, END_OF_RULE); 1003 idBlockResult.insert(0, id->canonID); 1004 } 1005 1006 } else { 1007 // Couldn't parse an ID. 
Try to parse a global filter 1008 int32_t withParens = -1; 1009 UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, nullptr); 1010 if (f != nullptr) { 1011 if (ICU_Utility::parseChar(rule, p, END_OF_RULE) 1012 && (direction == UTRANS_FORWARD) == (withParens == 0)) 1013 { 1014 if (compoundFilter != nullptr) { 1015 // Multiple compound filters 1016 syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status); 1017 delete f; 1018 } else { 1019 compoundFilter = f; 1020 compoundFilterOffset = ruleCount; 1021 } 1022 } else { 1023 delete f; 1024 } 1025 } else { 1026 // Invalid ::id 1027 // Can be parsed as neither an ID nor a global filter 1028 syntaxError(U_INVALID_ID, rule, pos, status); 1029 } 1030 } 1031 delete id; 1032 pos = p; 1033 } else { 1034 if (parsingIDs) { 1035 tempstr.adoptInsteadAndCheckErrorCode(new UnicodeString(idBlockResult), status); 1036 // nullptr pointer check 1037 if (U_FAILURE(status)) { 1038 return; 1039 } 1040 U_ASSERT(idBlockVector.hasDeleter()); 1041 if (direction == UTRANS_FORWARD) 1042 idBlockVector.adoptElement(tempstr.orphan(), status); 1043 else 1044 idBlockVector.insertElementAt(tempstr.orphan(), 0, status); 1045 if (U_FAILURE(status)) { 1046 return; 1047 } 1048 idBlockResult.remove(); 1049 parsingIDs = false; 1050 curData = new TransliterationRuleData(status); 1051 // nullptr pointer check 1052 if (curData == nullptr) { 1053 status = U_MEMORY_ALLOCATION_ERROR; 1054 return; 1055 } 1056 parseData->data = curData; 1057 1058 // By default, rules use part of the private use area 1059 // E000..F8FF for variables and other stand-ins. Currently 1060 // the range F000..F8FF is typically sufficient. The 'use 1061 // variable range' pragma allows rule sets to modify this. 
1062 setVariableRange(0xF000, 0xF8FF, status); 1063 } 1064 1065 if (resemblesPragma(rule, pos, limit)) { 1066 int32_t ppp = parsePragma(rule, pos, limit, status); 1067 if (ppp < 0) { 1068 syntaxError(U_MALFORMED_PRAGMA, rule, pos, status); 1069 } 1070 pos = ppp; 1071 // Parse a rule 1072 } else { 1073 pos = parseRule(rule, pos, limit, status); 1074 } 1075 } 1076 } 1077 1078 if (parsingIDs && idBlockResult.length() > 0) { 1079 tempstr.adoptInsteadAndCheckErrorCode(new UnicodeString(idBlockResult), status); 1080 // nullptr pointer check 1081 if (U_FAILURE(status)) { 1082 // TODO: Testing, forcing this path, shows many memory leaks. ICU-21701 1083 // intltest translit/TransliteratorTest/TestInstantiation 1084 return; 1085 } 1086 if (direction == UTRANS_FORWARD) 1087 idBlockVector.adoptElement(tempstr.orphan(), status); 1088 else 1089 idBlockVector.insertElementAt(tempstr.orphan(), 0, status); 1090 if (U_FAILURE(status)) { 1091 return; 1092 } 1093 } 1094 else if (!parsingIDs && curData != nullptr) { 1095 if (direction == UTRANS_FORWARD) { 1096 dataVector.addElement(curData, status); 1097 } else { 1098 dataVector.insertElementAt(curData, 0, status); 1099 } 1100 if (U_FAILURE(status)) { 1101 delete curData; 1102 curData = nullptr; 1103 } 1104 } 1105 1106 if (U_SUCCESS(status)) { 1107 // Convert the set vector to an array 1108 int32_t i, dataVectorSize = dataVector.size(); 1109 for (i = 0; i < dataVectorSize; i++) { 1110 TransliterationRuleData* data = static_cast<TransliterationRuleData*>(dataVector.elementAt(i)); 1111 data->variablesLength = variablesVector.size(); 1112 if (data->variablesLength == 0) { 1113 data->variables = nullptr; 1114 } else { 1115 data->variables = static_cast<UnicodeFunctor**>(uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*))); 1116 // nullptr pointer check 1117 if (data->variables == nullptr) { 1118 status = U_MEMORY_ALLOCATION_ERROR; 1119 return; 1120 } 1121 data->variablesAreOwned = (i == 0); 1122 } 1123 1124 for (int32_t j = 0; j < 
data->variablesLength; j++) { 1125 data->variables[j] = 1126 static_cast<UnicodeFunctor *>(variablesVector.elementAt(j)); 1127 } 1128 1129 data->variableNames.removeAll(); 1130 int32_t p = UHASH_FIRST; 1131 const UHashElement* he = variableNames.nextElement(p); 1132 while (he != nullptr) { 1133 UnicodeString* tempus = static_cast<UnicodeString*>(he->value.pointer)->clone(); 1134 if (tempus == nullptr) { 1135 status = U_MEMORY_ALLOCATION_ERROR; 1136 return; 1137 } 1138 data->variableNames.put(*static_cast<UnicodeString*>(he->key.pointer), 1139 tempus, status); 1140 he = variableNames.nextElement(p); 1141 } 1142 } 1143 variablesVector.removeAllElements(); // keeps them from getting deleted when we succeed 1144 1145 // Index the rules 1146 if (compoundFilter != nullptr) { 1147 if ((direction == UTRANS_FORWARD && compoundFilterOffset != 1) || 1148 (direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) { 1149 status = U_MISPLACED_COMPOUND_FILTER; 1150 } 1151 } 1152 1153 for (i = 0; i < dataVectorSize; i++) { 1154 TransliterationRuleData* data = static_cast<TransliterationRuleData*>(dataVector.elementAt(i)); 1155 data->ruleSet.freeze(parseError, status); 1156 } 1157 if (idBlockVector.size() == 1 && static_cast<UnicodeString*>(idBlockVector.elementAt(0))->isEmpty()) { 1158 idBlockVector.removeElementAt(0); 1159 } 1160 } 1161 } 1162 1163 /** 1164 * Set the variable range to [start, end] (inclusive). 1165 */ 1166 void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) { 1167 if (start > end || start < 0 || end > 0xFFFF) { 1168 status = U_MALFORMED_PRAGMA; 1169 return; 1170 } 1171 1172 curData->variablesBase = static_cast<char16_t>(start); 1173 if (dataVector.size() == 0) { 1174 variableNext = static_cast<char16_t>(start); 1175 variableLimit = static_cast<char16_t>(end + 1); 1176 } 1177 } 1178 1179 /** 1180 * Assert that the given character is NOT within the variable range. 1181 * If it is, return false. 
This is necessary to ensure that the 1182 * variable range does not overlap characters used in a rule. 1183 */ 1184 UBool TransliteratorParser::checkVariableRange(UChar32 ch) const { 1185 return !(ch >= curData->variablesBase && ch < variableLimit); 1186 } 1187 1188 /** 1189 * Set the maximum backup to 'backup', in response to a pragma 1190 * statement. 1191 */ 1192 void TransliteratorParser::pragmaMaximumBackup(int32_t /*backup*/) { 1193 //TODO Finish 1194 } 1195 1196 /** 1197 * Begin normalizing all rules using the given mode, in response 1198 * to a pragma statement. 1199 */ 1200 void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /*mode*/) { 1201 //TODO Finish 1202 } 1203 1204 static const char16_t PRAGMA_USE[] = {0x75,0x73,0x65,0x20,0}; // "use " 1205 1206 static const char16_t PRAGMA_VARIABLE_RANGE[] = {0x7E,0x76,0x61,0x72,0x69,0x61,0x62,0x6C,0x65,0x20,0x72,0x61,0x6E,0x67,0x65,0x20,0x23,0x20,0x23,0x7E,0x3B,0}; // "~variable range # #~;" 1207 1208 static const char16_t PRAGMA_MAXIMUM_BACKUP[] = {0x7E,0x6D,0x61,0x78,0x69,0x6D,0x75,0x6D,0x20,0x62,0x61,0x63,0x6B,0x75,0x70,0x20,0x23,0x7E,0x3B,0}; // "~maximum backup #~;" 1209 1210 static const char16_t PRAGMA_NFD_RULES[] = {0x7E,0x6E,0x66,0x64,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfd rules~;" 1211 1212 static const char16_t PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C,0x65,0x73,0x7E,0x3B,0}; // "~nfc rules~;" 1213 1214 /** 1215 * Return true if the given rule looks like a pragma. 1216 * @param pos offset to the first non-whitespace character 1217 * of the rule. 1218 * @param limit pointer past the last character of the rule. 1219 */ 1220 UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) { 1221 // Must start with /use\s/i 1222 return ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(true, PRAGMA_USE, 4), nullptr) >= 0; 1223 } 1224 1225 /** 1226 * Parse a pragma. 
This method assumes resemblesPragma() has 1227 * already returned true. 1228 * @param pos offset to the first non-whitespace character 1229 * of the rule. 1230 * @param limit pointer past the last character of the rule. 1231 * @return the position index after the final ';' of the pragma, 1232 * or -1 on failure. 1233 */ 1234 int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 1235 int32_t array[2]; 1236 1237 // resemblesPragma() has already returned true, so we 1238 // know that pos points to /use\s/i; we can skip 4 characters 1239 // immediately 1240 pos += 4; 1241 1242 // Here are the pragmas we recognize: 1243 // use variable range 0xE000 0xEFFF; 1244 // use maximum backup 16; 1245 // use nfd rules; 1246 // use nfc rules; 1247 int p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(true, PRAGMA_VARIABLE_RANGE, -1), array); 1248 if (p >= 0) { 1249 setVariableRange(array[0], array[1], status); 1250 return p; 1251 } 1252 1253 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(true, PRAGMA_MAXIMUM_BACKUP, -1), array); 1254 if (p >= 0) { 1255 pragmaMaximumBackup(array[0]); 1256 return p; 1257 } 1258 1259 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(true, PRAGMA_NFD_RULES, -1), nullptr); 1260 if (p >= 0) { 1261 pragmaNormalizeRules(UNORM_NFD); 1262 return p; 1263 } 1264 1265 p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(true, PRAGMA_NFC_RULES, -1), nullptr); 1266 if (p >= 0) { 1267 pragmaNormalizeRules(UNORM_NFC); 1268 return p; 1269 } 1270 1271 // Syntax error: unable to parse pragma 1272 return -1; 1273 } 1274 1275 /** 1276 * MAIN PARSER. Parse the next rule in the given rule string, starting 1277 * at pos. Return the index after the last character parsed. Do not 1278 * parse characters at or after limit. 1279 * 1280 * Important: The character at pos must be a non-whitespace character 1281 * that is not the comment character. 
1282 * 1283 * This method handles quoting, escaping, and whitespace removal. It 1284 * parses the end-of-rule character. It recognizes context and cursor 1285 * indicators. Once it does a lexical breakdown of the rule at pos, it 1286 * creates a rule object and adds it to our rule list. 1287 */ 1288 int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) { 1289 // Locate the left side, operator, and right side 1290 int32_t start = pos; 1291 char16_t op = 0; 1292 int32_t i; 1293 1294 // Set up segments data 1295 segmentStandins.truncate(0); 1296 segmentObjects.removeAllElements(); 1297 1298 // Use pointers to automatics to make swapping possible. 1299 RuleHalf _left(*this), _right(*this); 1300 RuleHalf* left = &_left; 1301 RuleHalf* right = &_right; 1302 1303 undefinedVariableName.remove(); 1304 pos = left->parse(rule, pos, limit, status); 1305 if (U_FAILURE(status)) { 1306 return start; 1307 } 1308 1309 if (pos == limit || u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == nullptr) { 1310 return syntaxError(U_MISSING_OPERATOR, rule, start, status); 1311 } 1312 ++pos; 1313 1314 // Found an operator char. Check for forward-reverse operator. 1315 if (op == REVERSE_RULE_OP && 1316 (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { 1317 ++pos; 1318 op = FWDREV_RULE_OP; 1319 } 1320 1321 // Translate alternate op characters. 
1322 switch (op) { 1323 case ALT_FORWARD_RULE_OP: 1324 op = FORWARD_RULE_OP; 1325 break; 1326 case ALT_REVERSE_RULE_OP: 1327 op = REVERSE_RULE_OP; 1328 break; 1329 case ALT_FWDREV_RULE_OP: 1330 op = FWDREV_RULE_OP; 1331 break; 1332 } 1333 1334 pos = right->parse(rule, pos, limit, status); 1335 if (U_FAILURE(status)) { 1336 return start; 1337 } 1338 1339 if (pos < limit) { 1340 if (rule.charAt(--pos) == END_OF_RULE) { 1341 ++pos; 1342 } else { 1343 // RuleHalf parser must have terminated at an operator 1344 return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status); 1345 } 1346 } 1347 1348 if (op == VARIABLE_DEF_OP) { 1349 // LHS is the name. RHS is a single character, either a literal 1350 // or a set (already parsed). If RHS is longer than one 1351 // character, it is either a multi-character string, or multiple 1352 // sets, or a mixture of chars and sets -- syntax error. 1353 1354 // We expect to see a single undefined variable (the one being 1355 // defined). 1356 if (undefinedVariableName.length() == 0) { 1357 // "Missing '$' or duplicate definition" 1358 return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status); 1359 } 1360 if (left->text.length() != 1 || left->text.charAt(0) != variableLimit) { 1361 // "Malformed LHS" 1362 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); 1363 } 1364 if (left->anchorStart || left->anchorEnd || 1365 right->anchorStart || right->anchorEnd) { 1366 return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status); 1367 } 1368 // We allow anything on the right, including an empty string. 
1369 LocalPointer<UnicodeString> value(new UnicodeString(right->text), status); 1370 // nullptr pointer check 1371 if (U_FAILURE(status)) { 1372 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1373 } 1374 variableNames.put(undefinedVariableName, value.orphan(), status); 1375 ++variableLimit; 1376 return pos; 1377 } 1378 1379 // If this is not a variable definition rule, we shouldn't have 1380 // any undefined variable names. 1381 if (undefinedVariableName.length() != 0) { 1382 return syntaxError(// "Undefined variable $" + undefinedVariableName, 1383 U_UNDEFINED_VARIABLE, 1384 rule, start, status); 1385 } 1386 1387 // Verify segments 1388 if (segmentStandins.length() > segmentObjects.size()) { 1389 syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status); 1390 } 1391 for (i=0; i<segmentStandins.length(); ++i) { 1392 if (segmentStandins.charAt(i) == 0) { 1393 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen 1394 } 1395 } 1396 for (i=0; i<segmentObjects.size(); ++i) { 1397 if (segmentObjects.elementAt(i) == nullptr) { 1398 syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen 1399 } 1400 } 1401 1402 // If the direction we want doesn't match the rule 1403 // direction, do nothing. 1404 if (op != FWDREV_RULE_OP && 1405 ((direction == UTRANS_FORWARD) != (op == FORWARD_RULE_OP))) { 1406 return pos; 1407 } 1408 1409 // Transform the rule into a forward rule by swapping the 1410 // sides if necessary. 1411 if (direction == UTRANS_REVERSE) { 1412 left = &_right; 1413 right = &_left; 1414 } 1415 1416 // Remove non-applicable elements in forward-reverse 1417 // rules. Bidirectional rules ignore elements that do not 1418 // apply. 
1419 if (op == FWDREV_RULE_OP) { 1420 right->removeContext(); 1421 left->cursor = -1; 1422 left->cursorOffset = 0; 1423 } 1424 1425 // Normalize context 1426 if (left->ante < 0) { 1427 left->ante = 0; 1428 } 1429 if (left->post < 0) { 1430 left->post = left->text.length(); 1431 } 1432 1433 // Context is only allowed on the input side. Cursors are only 1434 // allowed on the output side. Segment delimiters can only appear 1435 // on the left, and references on the right. Cursor offset 1436 // cannot appear without an explicit cursor. Cursor offset 1437 // cannot place the cursor outside the limits of the context. 1438 // Anchors are only allowed on the input side. 1439 if (right->ante >= 0 || right->post >= 0 || left->cursor >= 0 || 1440 (right->cursorOffset != 0 && right->cursor < 0) || 1441 // - The following two checks were used to ensure that the 1442 // - the cursor offset stayed within the ante- or postcontext. 1443 // - However, with the addition of quantifiers, we have to 1444 // - allow arbitrary cursor offsets and do runtime checking. 
1445 //(right->cursorOffset > (left->text.length() - left->post)) || 1446 //(-right->cursorOffset > left->ante) || 1447 right->anchorStart || right->anchorEnd || 1448 !left->isValidInput(*this) || !right->isValidOutput(*this) || 1449 left->ante > left->post) { 1450 1451 return syntaxError(U_MALFORMED_RULE, rule, start, status); 1452 } 1453 1454 // Flatten segment objects vector to an array 1455 LocalMemory<UnicodeFunctor*> segmentsArray; 1456 if (segmentObjects.size() > 0) { 1457 segmentsArray.adoptInstead(static_cast<UnicodeFunctor**>(uprv_malloc(segmentObjects.size() * sizeof(UnicodeFunctor*)))); 1458 // Null pointer check 1459 if (segmentsArray.isNull()) { 1460 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1461 } 1462 segmentObjects.toArray(reinterpret_cast<void**>(segmentsArray.getAlias())); 1463 } 1464 LocalPointer<TransliterationRule> temptr(new TransliterationRule( 1465 left->text, left->ante, left->post, 1466 right->text, right->cursor, right->cursorOffset, 1467 segmentsArray.getAlias(), 1468 segmentObjects.size(), 1469 left->anchorStart, left->anchorEnd, 1470 curData, 1471 status), status); 1472 //Null pointer check 1473 if (temptr.isValid()) { 1474 segmentsArray.orphan(); 1475 } 1476 if (U_FAILURE(status)) { 1477 return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status); 1478 } 1479 1480 curData->ruleSet.addRule(temptr.orphan(), status); 1481 1482 return pos; 1483 } 1484 1485 /** 1486 * Called by main parser upon syntax error. Search the rule string 1487 * for the probable end of the rule. Of course, if the error is that 1488 * the end of rule marker is missing, then the rule end will not be found. 1489 * In any case the rule start will be correctly reported. 
1490 * @param msg error description 1491 * @param rule pattern string 1492 * @param start position of first character of current rule 1493 */ 1494 int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode, 1495 const UnicodeString& rule, 1496 int32_t pos, 1497 UErrorCode& status) 1498 { 1499 parseError.offset = pos; 1500 parseError.line = 0 ; /* we are not using line numbers */ 1501 1502 // for pre-context 1503 const int32_t LEN = U_PARSE_CONTEXT_LEN - 1; 1504 int32_t start = uprv_max(pos - LEN, 0); 1505 int32_t stop = pos; 1506 1507 rule.extract(start,stop-start,parseError.preContext); 1508 //null terminate the buffer 1509 parseError.preContext[stop-start] = 0; 1510 1511 //for post-context 1512 start = pos; 1513 stop = uprv_min(pos + LEN, rule.length()); 1514 1515 rule.extract(start,stop-start,parseError.postContext); 1516 //null terminate the buffer 1517 parseError.postContext[stop-start]= 0; 1518 1519 status = parseErrorCode; 1520 return pos; 1521 1522 } 1523 1524 /** 1525 * Parse a UnicodeSet out, store it, and return the stand-in character 1526 * used to represent it. 1527 */ 1528 char16_t TransliteratorParser::parseSet(const UnicodeString& rule, 1529 ParsePosition& pos, 1530 UErrorCode& status) { 1531 UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, status); 1532 // Null pointer check 1533 if (set == nullptr) { 1534 status = U_MEMORY_ALLOCATION_ERROR; 1535 return static_cast<char16_t>(0x0000); // Return empty character with error. 1536 } 1537 set->compact(); 1538 return generateStandInFor(set, status); 1539 } 1540 1541 /** 1542 * Generate and return a stand-in for a new UnicodeFunctor. Store 1543 * the matcher (adopt it). 1544 */ 1545 char16_t TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) { 1546 // assert(obj != null); 1547 1548 // Look up previous stand-in, if any. This is a short list 1549 // (typical n is 0, 1, or 2); linear search is optimal. 
1550 for (int32_t i=0; i<variablesVector.size(); ++i) { 1551 if (variablesVector.elementAt(i) == adopted) { // [sic] pointer comparison 1552 return static_cast<char16_t>(curData->variablesBase + i); 1553 } 1554 } 1555 1556 if (variableNext >= variableLimit) { 1557 delete adopted; 1558 status = U_VARIABLE_RANGE_EXHAUSTED; 1559 return 0; 1560 } 1561 variablesVector.addElement(adopted, status); 1562 if (U_FAILURE(status)) { 1563 delete adopted; 1564 return 0; 1565 } 1566 return variableNext++; 1567 } 1568 1569 /** 1570 * Return the standin for segment seg (1-based). 1571 */ 1572 char16_t TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) { 1573 // Special character used to indicate an empty spot 1574 char16_t empty = curData->variablesBase - 1; 1575 while (segmentStandins.length() < seg) { 1576 segmentStandins.append(empty); 1577 } 1578 char16_t c = segmentStandins.charAt(seg-1); 1579 if (c == empty) { 1580 if (variableNext >= variableLimit) { 1581 status = U_VARIABLE_RANGE_EXHAUSTED; 1582 return 0; 1583 } 1584 c = variableNext++; 1585 // Set a placeholder in the primary variables vector that will be 1586 // filled in later by setSegmentObject(). We know that we will get 1587 // called first because setSegmentObject() will call us. 1588 variablesVector.addElement((void*) nullptr, status); 1589 segmentStandins.setCharAt(seg-1, c); 1590 } 1591 return c; 1592 } 1593 1594 /** 1595 * Set the object for segment seg (1-based). 1596 */ 1597 void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) { 1598 // Since we call parseSection() recursively, nested 1599 // segments will result in segment i+1 getting parsed 1600 // and stored before segment i; be careful with the 1601 // vector handling here. 
1602 if (segmentObjects.size() < seg) { 1603 segmentObjects.setSize(seg, status); 1604 } 1605 if (U_FAILURE(status)) { 1606 return; 1607 } 1608 int32_t index = getSegmentStandin(seg, status) - curData->variablesBase; 1609 if (segmentObjects.elementAt(seg-1) != nullptr || 1610 variablesVector.elementAt(index) != nullptr) { 1611 // should never happen 1612 if (U_SUCCESS(status)) {status = U_INTERNAL_TRANSLITERATOR_ERROR;} 1613 return; 1614 } 1615 // Note: neither segmentObjects or variablesVector has an object deleter function. 1616 segmentObjects.setElementAt(adopted, seg-1); 1617 variablesVector.setElementAt(adopted, index); 1618 } 1619 1620 /** 1621 * Return the stand-in for the dot set. It is allocated the first 1622 * time and reused thereafter. 1623 */ 1624 char16_t TransliteratorParser::getDotStandIn(UErrorCode& status) { 1625 if (dotStandIn == static_cast<char16_t>(-1)) { 1626 LocalPointer<UnicodeSet> tempus(new UnicodeSet(UnicodeString(true, DOT_SET, -1), status), status); 1627 // Null pointer check. 1628 if (U_FAILURE(status)) { 1629 return static_cast<char16_t>(0x0000); 1630 } 1631 dotStandIn = generateStandInFor(tempus.orphan(), status); 1632 } 1633 return dotStandIn; 1634 } 1635 1636 /** 1637 * Append the value of the given variable name to the given 1638 * UnicodeString. 1639 */ 1640 void TransliteratorParser::appendVariableDef(const UnicodeString& name, 1641 UnicodeString& buf, 1642 UErrorCode& status) { 1643 const UnicodeString* s = static_cast<const UnicodeString*>(variableNames.get(name)); 1644 if (s == nullptr) { 1645 // We allow one undefined variable so that variable definition 1646 // statements work. For the first undefined variable we return 1647 // the special placeholder variableLimit-1, and save the variable 1648 // name. 
1649 if (undefinedVariableName.length() == 0) { 1650 undefinedVariableName = name; 1651 if (variableNext >= variableLimit) { 1652 // throw new RuntimeException("Private use variables exhausted"); 1653 status = U_ILLEGAL_ARGUMENT_ERROR; 1654 return; 1655 } 1656 buf.append(--variableLimit); 1657 } else { 1658 //throw new IllegalArgumentException("Undefined variable $" 1659 // + name); 1660 status = U_ILLEGAL_ARGUMENT_ERROR; 1661 return; 1662 } 1663 } else { 1664 buf.append(*s); 1665 } 1666 } 1667 1668 /** 1669 * Glue method to get around access restrictions in C++. 1670 */ 1671 /*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) { 1672 return Transliterator::createBasicInstance(id, canonID); 1673 }*/ 1674 1675 U_NAMESPACE_END 1676 1677 U_CAPI int32_t 1678 utrans_stripRules(const char16_t *source, int32_t sourceLen, char16_t *target, UErrorCode *status) { 1679 U_NAMESPACE_USE 1680 1681 //const char16_t *sourceStart = source; 1682 const char16_t *targetStart = target; 1683 const char16_t *sourceLimit = source+sourceLen; 1684 char16_t *targetLimit = target+sourceLen; 1685 UChar32 c = 0; 1686 UBool quoted = false; 1687 int32_t index; 1688 1689 uprv_memset(target, 0, sourceLen*U_SIZEOF_UCHAR); 1690 1691 /* read the rules into the buffer */ 1692 while (source < sourceLimit) 1693 { 1694 index=0; 1695 U16_NEXT_UNSAFE(source, index, c); 1696 source+=index; 1697 if(c == QUOTE) { 1698 quoted = !quoted; 1699 } 1700 else if (!quoted) { 1701 if (c == RULE_COMMENT_CHAR) { 1702 /* skip comments and all preceding spaces */ 1703 while (targetStart < target && *(target - 1) == 0x0020) { 1704 target--; 1705 } 1706 do { 1707 if (source == sourceLimit) { 1708 c = U_SENTINEL; 1709 break; 1710 } 1711 c = *(source++); 1712 } 1713 while (c != CR && c != LF); 1714 if (c < 0) { 1715 break; 1716 } 1717 } 1718 else if (c == ESCAPE && source < sourceLimit) { 1719 UChar32 c2 = *source; 1720 if (c2 == CR || c2 == LF) { 1721 /* A 
backslash at the end of a line. */ 1722 /* Since we're stripping lines, ignore the backslash. */ 1723 source++; 1724 continue; 1725 } 1726 if (c2 == 0x0075 && source+5 < sourceLimit) { /* \u seen. \U isn't unescaped. */ 1727 int32_t escapeOffset = 0; 1728 UnicodeString escapedStr(source, 5); 1729 c2 = escapedStr.unescapeAt(escapeOffset); 1730 1731 if (c2 == (UChar32)0xFFFFFFFF || escapeOffset == 0) 1732 { 1733 *status = U_PARSE_ERROR; 1734 return 0; 1735 } 1736 if (!PatternProps::isWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) { 1737 /* It was escaped for a reason. Write what it was suppose to be. */ 1738 source+=5; 1739 c = c2; 1740 } 1741 } 1742 else if (c2 == QUOTE) { 1743 /* \' seen. Make sure we don't do anything when we see it again. */ 1744 quoted = !quoted; 1745 } 1746 } 1747 } 1748 if (c == CR || c == LF) 1749 { 1750 /* ignore spaces carriage returns, and all leading spaces on the next line. 1751 * and line feed unless in the form \uXXXX 1752 */ 1753 quoted = false; 1754 while (source < sourceLimit) { 1755 c = *(source); 1756 if (c != CR && c != LF && c != 0x0020) { 1757 break; 1758 } 1759 source++; 1760 } 1761 continue; 1762 } 1763 1764 /* Append char16_t * after dissembling if c > 0xffff*/ 1765 index=0; 1766 U16_APPEND_UNSAFE(target, index, c); 1767 target+=index; 1768 } 1769 if (target < targetLimit) { 1770 *target = 0; 1771 } 1772 return (int32_t)(target-targetStart); 1773 } 1774 1775 #endif /* #if !UCONFIG_NO_TRANSLITERATION */