nfrule.cpp (67573B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 1997-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ****************************************************************************** 8 * file name: nfrule.cpp 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * Modification history 14 * Date Name Comments 15 * 10/11/2001 Doug Ported from ICU4J 16 */ 17 18 #include "nfrule.h" 19 20 #if U_HAVE_RBNF 21 22 #include "unicode/localpointer.h" 23 #include "unicode/rbnf.h" 24 #include "unicode/tblcoll.h" 25 #include "unicode/plurfmt.h" 26 #include "unicode/upluralrules.h" 27 #include "unicode/coleitr.h" 28 #include "unicode/uchar.h" 29 #include "nfrs.h" 30 #include "nfrlist.h" 31 #include "nfsubs.h" 32 #include "patternprops.h" 33 #include "putilimp.h" 34 35 U_NAMESPACE_BEGIN 36 37 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf, const UnicodeString &_ruleText, UErrorCode &status) 38 : baseValue(static_cast<int32_t>(0)) 39 , radix(10) 40 , exponent(0) 41 , decimalPoint(0) 42 , fRuleText(_ruleText) 43 , sub1(nullptr) 44 , sub2(nullptr) 45 , formatter(_rbnf) 46 , rulePatternFormat(nullptr) 47 { 48 if (!fRuleText.isEmpty()) { 49 parseRuleDescriptor(fRuleText, status); 50 } 51 } 52 53 NFRule::~NFRule() 54 { 55 if (sub1 != sub2) { 56 delete sub2; 57 sub2 = nullptr; 58 } 59 delete sub1; 60 sub1 = nullptr; 61 delete rulePatternFormat; 62 rulePatternFormat = nullptr; 63 } 64 65 static const char16_t gLeftBracket = 0x005b; 66 static const char16_t gRightBracket = 0x005d; 67 static const char16_t gVerticalLine = 0x007C; 68 static const char16_t gColon = 0x003a; 69 static const char16_t gZero = 0x0030; 70 static const char16_t gNine = 0x0039; 71 static const char16_t gSpace = 0x0020; 72 static const char16_t gSlash = 0x002f; 73 static const char16_t 
gGreaterThan = 0x003e; 74 static const char16_t gLessThan = 0x003c; 75 static const char16_t gComma = 0x002c; 76 static const char16_t gDot = 0x002e; 77 static const char16_t gTick = 0x0027; 78 //static const char16_t gMinus = 0x002d; 79 static const char16_t gSemicolon = 0x003b; 80 static const char16_t gX = 0x0078; 81 82 static const char16_t gMinusX[] = {0x2D, 0x78, 0}; /* "-x" */ 83 static const char16_t gInf[] = {0x49, 0x6E, 0x66, 0}; /* "Inf" */ 84 static const char16_t gNaN[] = {0x4E, 0x61, 0x4E, 0}; /* "NaN" */ 85 86 static const char16_t gDollarOpenParenthesis[] = {0x24, 0x28, 0}; /* "$(" */ 87 static const char16_t gClosedParenthesisDollar[] = {0x29, 0x24, 0}; /* ")$" */ 88 89 static const char16_t gLessLess[] = {0x3C, 0x3C, 0}; /* "<<" */ 90 static const char16_t gLessPercent[] = {0x3C, 0x25, 0}; /* "<%" */ 91 static const char16_t gLessHash[] = {0x3C, 0x23, 0}; /* "<#" */ 92 static const char16_t gLessZero[] = {0x3C, 0x30, 0}; /* "<0" */ 93 static const char16_t gGreaterGreater[] = {0x3E, 0x3E, 0}; /* ">>" */ 94 static const char16_t gGreaterPercent[] = {0x3E, 0x25, 0}; /* ">%" */ 95 static const char16_t gGreaterHash[] = {0x3E, 0x23, 0}; /* ">#" */ 96 static const char16_t gGreaterZero[] = {0x3E, 0x30, 0}; /* ">0" */ 97 static const char16_t gEqualPercent[] = {0x3D, 0x25, 0}; /* "=%" */ 98 static const char16_t gEqualHash[] = {0x3D, 0x23, 0}; /* "=#" */ 99 static const char16_t gEqualZero[] = {0x3D, 0x30, 0}; /* "=0" */ 100 static const char16_t gGreaterGreaterGreater[] = {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */ 101 102 static const char16_t * const RULE_PREFIXES[] = { 103 gLessLess, gLessPercent, gLessHash, gLessZero, 104 gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero, 105 gEqualPercent, gEqualHash, gEqualZero, nullptr 106 }; 107 108 void 109 NFRule::makeRules(UnicodeString& description, 110 NFRuleSet *owner, 111 const NFRule *predecessor, 112 const RuleBasedNumberFormat *rbnf, 113 NFRuleList& rules, 114 UErrorCode& status) 115 { 116 if 
(U_FAILURE(status)) { 117 return; 118 } 119 // we know we're making at least one rule, so go ahead and 120 // new it up and initialize its basevalue and divisor 121 // (this also strips the rule descriptor, if any, off the 122 // description string) 123 LocalPointer<NFRule> rule1(new NFRule(rbnf, description, status)); 124 if (U_FAILURE(status)) { 125 return; 126 } 127 /* test for nullptr */ 128 if (rule1.isNull()) { 129 status = U_MEMORY_ALLOCATION_ERROR; 130 return; 131 } 132 description = rule1->fRuleText; 133 134 // check the description to see whether there's text enclosed 135 // in brackets 136 int32_t brack1 = description.indexOf(gLeftBracket); 137 int32_t brack2 = brack1 < 0 ? -1 : description.indexOf(gRightBracket); 138 139 // if the description doesn't contain a matched pair of brackets, 140 // or if it's of a type that doesn't recognize bracketed text, 141 // then leave the description alone, initialize the rule's 142 // rule text and substitutions, and return that rule 143 if (brack2 < 0 || brack1 > brack2 144 || rule1->getType() == kProperFractionRule 145 || rule1->getType() == kNegativeNumberRule 146 || rule1->getType() == kInfinityRule 147 || rule1->getType() == kNaNRule) 148 { 149 rule1->extractSubstitutions(owner, description, predecessor, status); 150 if (U_FAILURE(status)) { 151 return; 152 } 153 } 154 else { 155 // if the description does contain a matched pair of brackets, 156 // then it's really shorthand for two rules (with one exception) 157 LocalPointer<NFRule> rule2; 158 UnicodeString sbuf; 159 int32_t orElseOp = description.indexOf(gVerticalLine); 160 161 // we'll actually only split the rule into two rules if its 162 // base value is an even multiple of its divisor (or it's one 163 // of the special rules) 164 if ((rule1->baseValue > 0 165 && (rule1->radix != 0) // ICU-23109 Ensure next line won't "% 0" 166 && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0) 167 || rule1->getType() == kImproperFractionRule 168 || 
rule1->getType() == kDefaultRule) { 169 170 // if it passes that test, new up the second rule. If the 171 // rule set both rules will belong to is a fraction rule 172 // set, they both have the same base value; otherwise, 173 // increment the original rule's base value ("rule1" actually 174 // goes SECOND in the rule set's rule list) 175 rule2.adoptInstead(new NFRule(rbnf, UnicodeString(), status)); 176 if (U_FAILURE(status)) { 177 return; 178 } 179 /* test for nullptr */ 180 if (rule2.isNull()) { 181 status = U_MEMORY_ALLOCATION_ERROR; 182 return; 183 } 184 if (rule1->baseValue >= 0) { 185 rule2->baseValue = rule1->baseValue; 186 if (!owner->isFractionRuleSet()) { 187 ++rule1->baseValue; 188 } 189 } 190 191 // if the description began with "x.x" and contains bracketed 192 // text, it describes both the improper fraction rule and 193 // the proper fraction rule 194 else if (rule1->getType() == kImproperFractionRule) { 195 rule2->setType(kProperFractionRule); 196 } 197 198 // if the description began with "x.0" and contains bracketed 199 // text, it describes both the default rule and the 200 // improper fraction rule 201 else if (rule1->getType() == kDefaultRule) { 202 rule2->baseValue = rule1->baseValue; 203 rule1->setType(kImproperFractionRule); 204 } 205 206 // both rules have the same radix and exponent (i.e., the 207 // same divisor) 208 rule2->radix = rule1->radix; 209 rule2->exponent = rule1->exponent; 210 211 // By default, rule2's rule text omits the stuff in brackets, 212 // unless it contains a | between the brackets. 213 // Initialize its rule text and substitutions accordingly. 
214 sbuf.append(description, 0, brack1); 215 if (orElseOp >= 0) { 216 sbuf.append(description, orElseOp + 1, brack2 - orElseOp - 1); 217 } 218 if (brack2 + 1 < description.length()) { 219 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); 220 } 221 rule2->extractSubstitutions(owner, sbuf, predecessor, status); 222 if (U_FAILURE(status)) { 223 return; 224 } 225 } 226 227 // rule1's text includes the text in the brackets but omits 228 // the brackets themselves: initialize _its_ rule text and 229 // substitutions accordingly 230 sbuf.setTo(description, 0, brack1); 231 if (orElseOp >= 0) { 232 sbuf.append(description, brack1 + 1, orElseOp - brack1 - 1); 233 } 234 else { 235 sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); 236 } 237 if (brack2 + 1 < description.length()) { 238 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); 239 } 240 rule1->extractSubstitutions(owner, sbuf, predecessor, status); 241 if (U_FAILURE(status)) { 242 return; 243 } 244 245 // if we only have one rule, return it; if we have two, return 246 // a two-element array containing them (notice that rule2 goes 247 // BEFORE rule1 in the list: in all cases, rule2 OMITS the 248 // material in the brackets and rule1 INCLUDES the material 249 // in the brackets) 250 if (!rule2.isNull()) { 251 if (rule2->baseValue >= kNoBase) { 252 rules.add(rule2.orphan()); 253 } 254 else { 255 owner->setNonNumericalRule(rule2.orphan()); 256 } 257 } 258 } 259 if (rule1->baseValue >= kNoBase) { 260 rules.add(rule1.orphan()); 261 } 262 else { 263 owner->setNonNumericalRule(rule1.orphan()); 264 } 265 } 266 267 /** 268 * This function parses the rule's rule descriptor (i.e., the base 269 * value and/or other tokens that precede the rule's rule text 270 * in the description) and sets the rule's base value, radix, and 271 * exponent according to the descriptor. 
(If the description doesn't 272 * include a rule descriptor, then this function sets everything to 273 * default values and the rule set sets the rule's real base value). 274 * @param description The rule's description 275 * @return If "description" included a rule descriptor, this is 276 * "description" with the descriptor and any trailing whitespace 277 * stripped off. Otherwise; it's "descriptor" unchangd. 278 */ 279 void 280 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) 281 { 282 // the description consists of a rule descriptor and a rule body, 283 // separated by a colon. The rule descriptor is optional. If 284 // it's omitted, just set the base value to 0. 285 int32_t p = description.indexOf(gColon); 286 if (p != -1) { 287 // copy the descriptor out into its own string and strip it, 288 // along with any trailing whitespace, out of the original 289 // description 290 UnicodeString descriptor; 291 descriptor.setTo(description, 0, p); 292 293 ++p; 294 while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) { 295 ++p; 296 } 297 description.removeBetween(0, p); 298 299 // check first to see if the rule descriptor matches the token 300 // for one of the special rules. If it does, set the base 301 // value to the correct identifier value 302 int descriptorLength = descriptor.length(); 303 char16_t firstChar = descriptor.charAt(0); 304 char16_t lastChar = descriptor.charAt(descriptorLength - 1); 305 if (firstChar >= gZero && firstChar <= gNine && lastChar != gX) { 306 // if the rule descriptor begins with a digit, it's a descriptor 307 // for a normal rule 308 // since we don't have Long.parseLong, and this isn't much work anyway, 309 // just build up the value as we encounter the digits. 
310 int64_t val = 0; 311 p = 0; 312 char16_t c = gSpace; 313 314 // begin parsing the descriptor: copy digits 315 // into "tempValue", skip periods, commas, and spaces, 316 // stop on a slash or > sign (or at the end of the string), 317 // and throw an exception on any other character 318 while (p < descriptorLength) { 319 c = descriptor.charAt(p); 320 if (c >= gZero && c <= gNine) { 321 int64_t digit = static_cast<int64_t>(c - gZero); 322 if ((val > 0 && val > (INT64_MAX - digit) / 10) || 323 (val < 0 && val < (INT64_MIN - digit) / 10)) { 324 // out of int64_t range 325 status = U_PARSE_ERROR; 326 return; 327 } 328 val = val * 10 + digit; 329 } 330 else if (c == gSlash || c == gGreaterThan) { 331 break; 332 } 333 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { 334 } 335 else { 336 // throw new IllegalArgumentException("Illegal character in rule descriptor"); 337 status = U_PARSE_ERROR; 338 return; 339 } 340 ++p; 341 } 342 343 // we have the base value, so set it 344 setBaseValue(val, status); 345 346 // if we stopped the previous loop on a slash, we're 347 // now parsing the rule's radix. 
Again, accumulate digits 348 // in tempValue, skip punctuation, stop on a > mark, and 349 // throw an exception on anything else 350 if (c == gSlash) { 351 val = 0; 352 ++p; 353 while (p < descriptorLength) { 354 c = descriptor.charAt(p); 355 if (c >= gZero && c <= gNine) { 356 int64_t digit = static_cast<int64_t>(c - gZero); 357 if ((val > 0 && val > (INT64_MAX - digit) / 10) || 358 (val < 0 && val < (INT64_MIN - digit) / 10)) { 359 // out of int64_t range 360 status = U_PARSE_ERROR; 361 return; 362 } 363 val = val * 10 + digit; 364 } 365 else if (c == gGreaterThan) { 366 break; 367 } 368 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { 369 } 370 else { 371 // throw new IllegalArgumentException("Illegal character is rule descriptor"); 372 status = U_PARSE_ERROR; 373 return; 374 } 375 ++p; 376 } 377 378 // tempValue now contain's the rule's radix. Set it 379 // accordingly, and recalculate the rule's exponent 380 radix = static_cast<int32_t>(val); 381 if (radix == 0) { 382 // throw new IllegalArgumentException("Rule can't have radix of 0"); 383 status = U_PARSE_ERROR; 384 } 385 386 exponent = expectedExponent(); 387 } 388 389 // if we stopped the previous loop on a > sign, then continue 390 // for as long as we still see > signs. For each one, 391 // decrement the exponent (unless the exponent is already 0). 392 // If we see another character before reaching the end of 393 // the descriptor, that's also a syntax error. 
394 if (c == gGreaterThan) { 395 while (p < descriptor.length()) { 396 c = descriptor.charAt(p); 397 if (c == gGreaterThan && exponent > 0) { 398 --exponent; 399 } else { 400 // throw new IllegalArgumentException("Illegal character in rule descriptor"); 401 status = U_PARSE_ERROR; 402 return; 403 } 404 ++p; 405 } 406 } 407 } 408 else if (0 == descriptor.compare(gMinusX, 2)) { 409 setType(kNegativeNumberRule); 410 } 411 else if (descriptorLength == 3) { 412 if (firstChar == gZero && lastChar == gX) { 413 setBaseValue(kProperFractionRule, status); 414 decimalPoint = descriptor.charAt(1); 415 } 416 else if (firstChar == gX && lastChar == gX) { 417 setBaseValue(kImproperFractionRule, status); 418 decimalPoint = descriptor.charAt(1); 419 } 420 else if (firstChar == gX && lastChar == gZero) { 421 setBaseValue(kDefaultRule, status); 422 decimalPoint = descriptor.charAt(1); 423 } 424 else if (descriptor.compare(gNaN, 3) == 0) { 425 setBaseValue(kNaNRule, status); 426 } 427 else if (descriptor.compare(gInf, 3) == 0) { 428 setBaseValue(kInfinityRule, status); 429 } 430 } 431 } 432 // else use the default base value for now. 433 434 // finally, if the rule body begins with an apostrophe, strip it off 435 // (this is generally used to put whitespace at the beginning of 436 // a rule's rule text) 437 if (!description.isEmpty() && description.charAt(0) == gTick) { 438 description.removeBetween(0, 1); 439 } 440 441 // return the description with all the stuff we've just waded through 442 // stripped off the front. It now contains just the rule body. 443 // return description; 444 } 445 446 /** 447 * Searches the rule's rule text for the substitution tokens, 448 * creates the substitutions, and removes the substitution tokens 449 * from the rule's rule text. 
450 * @param owner The rule set containing this rule 451 * @param predecessor The rule preseding this one in "owners" rule list 452 * @param ownersOwner The RuleBasedFormat that owns this rule 453 */ 454 void 455 NFRule::extractSubstitutions(const NFRuleSet* ruleSet, 456 const UnicodeString &ruleText, 457 const NFRule* predecessor, 458 UErrorCode& status) 459 { 460 if (U_FAILURE(status)) { 461 return; 462 } 463 fRuleText = ruleText; 464 sub1 = extractSubstitution(ruleSet, predecessor, status); 465 if (sub1 == nullptr) { 466 // Small optimization. There is no need to create a redundant NullSubstitution. 467 sub2 = nullptr; 468 } 469 else { 470 sub2 = extractSubstitution(ruleSet, predecessor, status); 471 } 472 int32_t pluralRuleStart = fRuleText.indexOf(gDollarOpenParenthesis, -1, 0); 473 int32_t pluralRuleEnd = (pluralRuleStart >= 0 ? fRuleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) : -1); 474 if (pluralRuleEnd >= 0) { 475 int32_t endType = fRuleText.indexOf(gComma, pluralRuleStart); 476 if (endType < 0) { 477 status = U_PARSE_ERROR; 478 return; 479 } 480 UnicodeString type(fRuleText.tempSubString(pluralRuleStart + 2, endType - pluralRuleStart - 2)); 481 UPluralType pluralType; 482 if (type.startsWith(UNICODE_STRING_SIMPLE("cardinal"))) { 483 pluralType = UPLURAL_TYPE_CARDINAL; 484 } 485 else if (type.startsWith(UNICODE_STRING_SIMPLE("ordinal"))) { 486 pluralType = UPLURAL_TYPE_ORDINAL; 487 } 488 else { 489 status = U_ILLEGAL_ARGUMENT_ERROR; 490 return; 491 } 492 rulePatternFormat = formatter->createPluralFormat(pluralType, 493 fRuleText.tempSubString(endType + 1, pluralRuleEnd - endType - 1), status); 494 } 495 } 496 497 /** 498 * Searches the rule's rule text for the first substitution token, 499 * creates a substitution based on it, and removes the token from 500 * the rule's rule text. 
501 * @param owner The rule set containing this rule 502 * @param predecessor The rule preceding this one in the rule set's 503 * rule list 504 * @param ownersOwner The RuleBasedNumberFormat that owns this rule 505 * @return The newly-created substitution. This is never null; if 506 * the rule text doesn't contain any substitution tokens, this will 507 * be a NullSubstitution. 508 */ 509 NFSubstitution * 510 NFRule::extractSubstitution(const NFRuleSet* ruleSet, 511 const NFRule* predecessor, 512 UErrorCode& status) 513 { 514 NFSubstitution* result = nullptr; 515 516 // search the rule's rule text for the first two characters of 517 // a substitution token 518 int32_t subStart = indexOfAnyRulePrefix(); 519 int32_t subEnd = subStart; 520 521 // if we didn't find one, create a null substitution positioned 522 // at the end of the rule text 523 if (subStart == -1) { 524 return nullptr; 525 } 526 527 // special-case the ">>>" token, since searching for the > at the 528 // end will actually find the > in the middle 529 if (fRuleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) { 530 subEnd = subStart + 2; 531 532 // otherwise the substitution token ends with the same character 533 // it began with 534 } else { 535 char16_t c = fRuleText.charAt(subStart); 536 subEnd = fRuleText.indexOf(c, subStart + 1); 537 // special case for '<%foo<<' 538 if (c == gLessThan && subEnd != -1 && subEnd < fRuleText.length() - 1 && fRuleText.charAt(subEnd+1) == c) { 539 // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '==' in the middle 540 // occurs because of the juxtaposition of two different rules. The check for '<' is a hack 541 // to get around this. Having the duplicate at the front would cause problems with 542 // rules like "<<%" to format, say, percents... 
543 ++subEnd; 544 } 545 } 546 547 // if we don't find the end of the token (i.e., if we're on a single, 548 // unmatched token character), create a null substitution positioned 549 // at the end of the rule 550 if (subEnd == -1) { 551 return nullptr; 552 } 553 554 // if we get here, we have a real substitution token (or at least 555 // some text bounded by substitution token characters). Use 556 // makeSubstitution() to create the right kind of substitution 557 UnicodeString subToken; 558 subToken.setTo(fRuleText, subStart, subEnd + 1 - subStart); 559 result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet, 560 this->formatter, subToken, status); 561 562 // remove the substitution from the rule text 563 fRuleText.removeBetween(subStart, subEnd+1); 564 565 return result; 566 } 567 568 /** 569 * Sets the rule's base value, and causes the radix and exponent 570 * to be recalculated. This is used during construction when we 571 * don't know the rule's base value until after it's been 572 * constructed. It should be used at any other time. 573 * @param The new base value for the rule. 574 */ 575 void 576 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status) 577 { 578 // set the base value 579 baseValue = newBaseValue; 580 radix = 10; 581 582 // if this isn't a special rule, recalculate the radix and exponent 583 // (the radix always defaults to 10; if it's supposed to be something 584 // else, it's cleaned up by the caller and the exponent is 585 // recalculated again-- the only function that does this is 586 // NFRule.parseRuleDescriptor() ) 587 if (baseValue >= 1) { 588 exponent = expectedExponent(); 589 590 // this function gets called on a fully-constructed rule whose 591 // description didn't specify a base value. This means it 592 // has substitutions, and some substitutions hold on to copies 593 // of the rule's divisor. Fix their copies of the divisor. 
594 if (sub1 != nullptr) { 595 sub1->setDivisor(radix, exponent, status); 596 } 597 if (sub2 != nullptr) { 598 sub2->setDivisor(radix, exponent, status); 599 } 600 601 // if this is a special rule, its radix and exponent are basically 602 // ignored. Set them to "safe" default values 603 } else { 604 exponent = 0; 605 } 606 } 607 608 /** 609 * This calculates the rule's exponent based on its radix and base 610 * value. This will be the highest power the radix can be raised to 611 * and still produce a result less than or equal to the base value. 612 */ 613 int16_t 614 NFRule::expectedExponent() const 615 { 616 // since the log of 0, or the log base 0 of something, causes an 617 // error, declare the exponent in these cases to be 0 (we also 618 // deal with the special-rule identifiers here) 619 if (radix == 0 || baseValue < 1) { 620 return 0; 621 } 622 623 // we get rounding error in some cases-- for example, log 1000 / log 10 624 // gives us 1.9999999996 instead of 2. The extra logic here is to take 625 // that into account 626 int16_t tempResult = static_cast<int16_t>(uprv_log(static_cast<double>(baseValue)) / 627 uprv_log(static_cast<double>(radix))); 628 int64_t temp = util64_pow(radix, tempResult + 1); 629 if (temp <= baseValue) { 630 tempResult += 1; 631 } 632 return tempResult; 633 } 634 635 /** 636 * Searches the rule's rule text for any of the specified strings. 637 * @return The index of the first match in the rule's rule text 638 * (i.e., the first substring in the rule's rule text that matches 639 * _any_ of the strings in "strings"). If none of the strings in 640 * "strings" is found in the rule's rule text, returns -1. 
641 */ 642 int32_t 643 NFRule::indexOfAnyRulePrefix() const 644 { 645 int result = -1; 646 for (int i = 0; RULE_PREFIXES[i]; i++) { 647 int32_t pos = fRuleText.indexOf(*RULE_PREFIXES[i]); 648 if (pos != -1 && (result == -1 || pos < result)) { 649 result = pos; 650 } 651 } 652 return result; 653 } 654 655 //----------------------------------------------------------------------- 656 // boilerplate 657 //----------------------------------------------------------------------- 658 659 static UBool 660 util_equalSubstitutions(const NFSubstitution* sub1, const NFSubstitution* sub2) 661 { 662 if (sub1) { 663 if (sub2) { 664 return *sub1 == *sub2; 665 } 666 } else if (!sub2) { 667 return true; 668 } 669 return false; 670 } 671 672 /** 673 * Tests two rules for equality. 674 * @param that The rule to compare this one against 675 * @return True is the two rules are functionally equivalent 676 */ 677 bool 678 NFRule::operator==(const NFRule& rhs) const 679 { 680 return baseValue == rhs.baseValue 681 && radix == rhs.radix 682 && exponent == rhs.exponent 683 && fRuleText == rhs.fRuleText 684 && util_equalSubstitutions(sub1, rhs.sub1) 685 && util_equalSubstitutions(sub2, rhs.sub2); 686 } 687 688 /** 689 * Returns a textual representation of the rule. This won't 690 * necessarily be the same as the description that this rule 691 * was created with, but it will produce the same result. 692 * @return A textual description of the rule 693 */ 694 static void util_append64(UnicodeString& result, int64_t n) 695 { 696 char16_t buffer[256]; 697 int32_t len = util64_tou(n, buffer, sizeof(buffer)); 698 UnicodeString temp(buffer, len); 699 result.append(temp); 700 } 701 702 void 703 NFRule::_appendRuleText(UnicodeString& result) const 704 { 705 switch (getType()) { 706 case kNegativeNumberRule: result.append(gMinusX, 2); break; 707 case kImproperFractionRule: result.append(gX).append(decimalPoint == 0 ? 
gDot : decimalPoint).append(gX); break; 708 case kProperFractionRule: result.append(gZero).append(decimalPoint == 0 ? gDot : decimalPoint).append(gX); break; 709 case kDefaultRule: result.append(gX).append(decimalPoint == 0 ? gDot : decimalPoint).append(gZero); break; 710 case kInfinityRule: result.append(gInf, 3); break; 711 case kNaNRule: result.append(gNaN, 3); break; 712 default: 713 // for a normal rule, write out its base value, and if the radix is 714 // something other than 10, write out the radix (with the preceding 715 // slash, of course). Then calculate the expected exponent and if 716 // if isn't the same as the actual exponent, write an appropriate 717 // number of > signs. Finally, terminate the whole thing with 718 // a colon. 719 util_append64(result, baseValue); 720 if (radix != 10) { 721 result.append(gSlash); 722 util_append64(result, radix); 723 } 724 int numCarets = expectedExponent() - exponent; 725 for (int i = 0; i < numCarets; i++) { 726 result.append(gGreaterThan); 727 } 728 break; 729 } 730 result.append(gColon); 731 result.append(gSpace); 732 733 // if the rule text begins with a space, write an apostrophe 734 // (whitespace after the rule descriptor is ignored; the 735 // apostrophe is used to make the whitespace significant) 736 if (fRuleText.charAt(0) == gSpace && (sub1 == nullptr || sub1->getPos() != 0)) { 737 result.append(gTick); 738 } 739 740 // now, write the rule's rule text, inserting appropriate 741 // substitution tokens in the appropriate places 742 UnicodeString ruleTextCopy; 743 ruleTextCopy.setTo(fRuleText); 744 745 UnicodeString temp; 746 if (sub2 != nullptr) { 747 sub2->toString(temp); 748 ruleTextCopy.insert(sub2->getPos(), temp); 749 } 750 if (sub1 != nullptr) { 751 sub1->toString(temp); 752 ruleTextCopy.insert(sub1->getPos(), temp); 753 } 754 755 result.append(ruleTextCopy); 756 757 // and finally, top the whole thing off with a semicolon and 758 // return the result 759 result.append(gSemicolon); 760 } 761 762 
int64_t NFRule::getDivisor() const 763 { 764 return util64_pow(radix, exponent); 765 } 766 767 /** 768 * Internal function to facilitate numerical rounding. See the explanation in MultiplierSubstitution::transformNumber(). 769 */ 770 bool NFRule::hasModulusSubstitution() const 771 { 772 return (sub1 != nullptr && sub1->isModulusSubstitution()) || (sub2 != nullptr && sub2->isModulusSubstitution()); 773 } 774 775 776 //----------------------------------------------------------------------- 777 // formatting 778 //----------------------------------------------------------------------- 779 780 /** 781 * Formats the number, and inserts the resulting text into 782 * toInsertInto. 783 * @param number The number being formatted 784 * @param toInsertInto The string where the resultant text should 785 * be inserted 786 * @param pos The position in toInsertInto where the resultant text 787 * should be inserted 788 */ 789 void 790 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const 791 { 792 // first, insert the rule's rule text into toInsertInto at the 793 // specified position, then insert the results of the substitutions 794 // into the right places in toInsertInto (notice we do the 795 // substitutions in reverse order so that the offsets don't get 796 // messed up) 797 int32_t pluralRuleStart = fRuleText.length(); 798 int32_t lengthOffset = 0; 799 if (!rulePatternFormat) { 800 toInsertInto.insert(pos, fRuleText); 801 } 802 else { 803 pluralRuleStart = fRuleText.indexOf(gDollarOpenParenthesis, -1, 0); 804 int pluralRuleEnd = fRuleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart); 805 int initialLength = toInsertInto.length(); 806 if (pluralRuleEnd < fRuleText.length() - 1) { 807 toInsertInto.insert(pos, fRuleText.tempSubString(pluralRuleEnd + 2)); 808 } 809 toInsertInto.insert(pos, 810 rulePatternFormat->format(static_cast<int32_t>(number / util64_pow(radix, exponent)), status)); 811 if 
(pluralRuleStart > 0) {
            toInsertInto.insert(pos, fRuleText.tempSubString(0, pluralRuleStart));
        }
        lengthOffset = fRuleText.length() - (toInsertInto.length() - initialLength);
    }

    if (sub2 != nullptr) {
        sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
    }
    if (sub1 != nullptr) {
        sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
    }
}

/**
 * Formats the number, and inserts the resulting text into
 * toInsertInto.
 * @param number The number being formatted
 * @param toInsertInto The string where the resultant text should
 * be inserted
 * @param pos The position in toInsertInto where the resultant text
 * should be inserted
 * @param recursionCount Passed through unchanged to the substitutions'
 * doSubstitution() calls.
 * @param status Passed to the plural formatter (when the rule has one)
 * and to the substitutions' doSubstitution() calls.
 */
void
NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos, int32_t recursionCount, UErrorCode& status) const
{
    // first, insert the rule's rule text into toInsertInto at the
    // specified position, then insert the results of the substitutions
    // into the right places in toInsertInto
    // [again, we have two copies of this routine that do the same thing
    // so that we don't sacrifice precision in a long by casting it
    // to a double]
    int32_t pluralRuleStart = fRuleText.length();
    int32_t lengthOffset = 0;
    if (!rulePatternFormat) {
        // no plural part: the rule text goes in verbatim
        toInsertInto.insert(pos, fRuleText);
    }
    else {
        // The rule text contains a plural section. The three pieces are
        // all inserted at the same "pos", in reverse order (text after the
        // plural section, formatted plural text, text before it), so they
        // end up in the output in their natural order.
        pluralRuleStart = fRuleText.indexOf(gDollarOpenParenthesis, -1, 0);
        int pluralRuleEnd = fRuleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart);
        int initialLength = toInsertInto.length();
        if (pluralRuleEnd < fRuleText.length() - 1) {
            // "+ 2" skips the two-character closing marker
            toInsertInto.insert(pos, fRuleText.tempSubString(pluralRuleEnd + 2));
        }
        double pluralVal = number;
        if (0 <= pluralVal && pluralVal < 1) {
            // We're in a fractional rule, and we have to match the NumeratorSubstitution behavior.
            // 2.3 can become 0.2999999999999998 for the fraction due to rounding errors.
            pluralVal = uprv_round(pluralVal * util64_pow(radix, exponent));
        }
        else {
            pluralVal = pluralVal / util64_pow(radix, exponent);
        }
        // NOTE(review): pluralVal is narrowed to int32_t here; values outside
        // the int32_t range are not guarded against in this function.
        toInsertInto.insert(pos, rulePatternFormat->format(static_cast<int32_t>(pluralVal), status));
        if (pluralRuleStart > 0) {
            toInsertInto.insert(pos, fRuleText.tempSubString(0, pluralRuleStart));
        }
        // difference between the raw rule text length and the length of the
        // text actually inserted; used below to shift the positions of
        // substitutions that sit after the start of the plural section
        lengthOffset = fRuleText.length() - (toInsertInto.length() - initialLength);
    }

    // sub2 is handled before sub1 -- presumably so that the text inserted
    // for the later substitution cannot shift the insertion point of the
    // earlier one (confirm against NFSubstitution::getPos() ordering)
    if (sub2 != nullptr) {
        sub2->doSubstitution(number, toInsertInto, pos - (sub2->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
    }
    if (sub1 != nullptr) {
        sub1->doSubstitution(number, toInsertInto, pos - (sub1->getPos() > pluralRuleStart ? lengthOffset : 0), recursionCount, status);
    }
}

/**
 * Used by the owning rule set to determine whether to invoke the
 * rollback rule (i.e., whether this rule or the one that precedes
 * it in the rule set's list should be used to format the number)
 * @param number The number being formatted
 * @return True if the rule set should use the rule that precedes
 * this one in its list; false if it should use this rule
 */
UBool
NFRule::shouldRollBack(int64_t number) const
{
    // we roll back if the rule contains a modulus substitution,
    // the number being formatted is an even multiple of the rule's
    // divisor, and the rule's base value is NOT an even multiple
    // of its divisor
    // In other words, if the original description had
    //    100: << hundred[ >>];
    // that expands into
    //    100: << hundred;
    //    101: << hundred >>;
    // internally.  But when we're formatting 200, if we use the rule
    // at 101, which would normally apply, we get "two hundred zero".
    // To prevent this, we roll back and use the rule at 100 instead.
    // This is the logic that makes this happen: the rule at 101 has
    // a modulus substitution, its base value isn't an even multiple
    // of 100, and the value we're trying to format _is_ an even
    // multiple of 100.  This is called the "rollback rule."
    if ((sub1 != nullptr && sub1->isModulusSubstitution()) || (sub2 != nullptr && sub2->isModulusSubstitution())) {
        // "re" is the rule's divisor, radix raised to exponent
        // NOTE(review): a zero divisor is not guarded against here.
        int64_t re = util64_pow(radix, exponent);
        return (number % re) == 0 && (baseValue % re) != 0;
    }
    return false;
}

//-----------------------------------------------------------------------
// parsing
//-----------------------------------------------------------------------

/**
 * Attempts to parse the string with this rule.
 * @param text The string being parsed
 * @param parsePosition On entry, the value is ignored and assumed to
 * be 0.  On exit, this has been updated with the position of the first
 * character not consumed by matching the text against this rule
 * (if this rule doesn't match the text at all, the parse position
 * is left at 0 and resVal is set to 0).
 * @param isFractionRule True if this rule is contained within a
 * fraction rule set.  This is only used if the rule has no
 * substitutions.
 * @param upperBound Passed through to matchToDelimiter(), which hands
 * it to the substitutions when they parse.
 * @param nonNumericalExecutedRuleMask Passed through unchanged to
 * matchToDelimiter().
 * @param recursionCount Passed through unchanged to matchToDelimiter().
 * @param resVal Receives the parse result.  If this rule matched the
 * text, this is the rule's base value combined appropriately with the
 * results of parsing the substitutions (stored as a double); if the
 * rule's leading text did not match, it is set to the long value 0.
 * @return Always true in the current implementation; success is
 * signalled through parsePosition, not through the return value.
*/
#ifdef RBNF_DEBUG
#include <stdio.h>

// Debug-only helper: writes "us" to the stdio stream "f" via a temporary
// heap buffer of length()+1 chars. Prints nothing if the allocation fails.
static void dumpUS(FILE* f, const UnicodeString& us) {
  int len = us.length();
  char* buf = (char *)uprv_malloc((len+1)*sizeof(char)); //new char[len+1];
  if (buf != nullptr) {
    us.extract(0, len, buf);
    buf[len] = 0;
    fprintf(f, "%s", buf);
    uprv_free(buf); //delete[] buf;
  }
}
#endif
UBool
NFRule::doParse(const UnicodeString& text,
                ParsePosition& parsePosition,
                UBool isFractionRule,
                double upperBound,
                uint32_t nonNumericalExecutedRuleMask,
                int32_t recursionCount,
                Formattable& resVal) const
{
    // internally we operate on a copy of the string being parsed
    // (because we're going to change it) and use our own ParsePosition
    ParsePosition pp;
    UnicodeString workText(text);

    // a missing substitution is treated as sitting at the very end of
    // the rule text
    int32_t sub1Pos = sub1 != nullptr ? sub1->getPos() : fRuleText.length();
    int32_t sub2Pos = sub2 != nullptr ? sub2->getPos() : fRuleText.length();

    // check to see whether the text before the first substitution
    // matches the text at the beginning of the string being
    // parsed.  If it does, strip that off the front of workText;
    // otherwise, dump out with a mismatch
    UnicodeString prefix;
    prefix.setTo(fRuleText, 0, sub1Pos);

#ifdef RBNF_DEBUG
    fprintf(stderr, "doParse %p ", this);
    {
        UnicodeString rt;
        _appendRuleText(rt);
        dumpUS(stderr, rt);
    }

    fprintf(stderr, " text: '");
    dumpUS(stderr, text);
    fprintf(stderr, "' prefix: '");
    dumpUS(stderr, prefix);
#endif
    stripPrefix(workText, prefix, pp);
    // number of characters of "text" consumed by the prefix
    // (note: this local shadows the member function prefixLength())
    int32_t prefixLength = text.length() - workText.length();

#ifdef RBNF_DEBUG
    fprintf(stderr, "'  pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1Pos);
#endif

    // the rule has leading text (sub1Pos != 0) but none of it matched:
    // report a mismatch by leaving the caller's index alone and
    // returning a zero result
    if (pp.getIndex() == 0 && sub1Pos != 0) {
        // commented out because ParsePosition doesn't have error index in 1.1.x
        // restored for ICU4C port
        parsePosition.setErrorIndex(pp.getErrorIndex());
        resVal.setLong(0);
        return true;
    }
    if (baseValue == kInfinityRule) {
        // If you match this, don't try to perform any calculations on it.
        parsePosition.setIndex(pp.getIndex());
        resVal.setDouble(uprv_getInfinity());
        return true;
    }
    if (baseValue == kNaNRule) {
        // If you match this, don't try to perform any calculations on it.
        parsePosition.setIndex(pp.getIndex());
        resVal.setDouble(uprv_getNaN());
        return true;
    }

    // this is the fun part.  The basic guts of the rule-matching
    // logic is matchToDelimiter(), which is called twice.  The first
    // time it searches the input string for the rule text BETWEEN
    // the substitutions and tries to match the intervening text
    // in the input string with the first substitution.  If that
    // succeeds, it then calls it again, this time to look for the
    // rule text after the second substitution and to match the
    // intervening input text against the second substitution.
    //
    // For example, say we have a rule that looks like this:
    //    first << middle >> last;
    // and input text that looks like this:
    //    first one middle two last
    // First we use stripPrefix() to match "first " in both places and
    // strip it off the front, leaving
    //    one middle two last
    // Then we use matchToDelimiter() to match " middle " and try to
    // match "one" against a substitution.  If it's successful, we now
    // have
    //    two last
    // We use matchToDelimiter() a second time to match " last" and
    // try to match "two" against a substitution.  If "two" matches
    // the substitution, we have a successful parse.
    //
    // Since it's possible in many cases to find multiple instances
    // of each of these pieces of rule text in the input string,
    // we need to try all the possible combinations of these
    // locations.  This prevents us from prematurely declaring a mismatch,
    // and makes sure we match as much input text as we can.
    int highWaterMark = 0;
    double result = 0;
    int start = 0;
    // negative base values (the special rule markers) are treated as 0
    double tempBaseValue = static_cast<double>(baseValue <= 0 ? 0 : baseValue);

    UnicodeString temp;
    do {
        // our partial parse result starts out as this rule's base
        // value.  If it finds a successful match, matchToDelimiter()
        // will compose this in some way with what it gets back from
        // the substitution, giving us a new partial parse result
        pp.setIndex(0);

        // rule text between the two substitutions
        temp.setTo(fRuleText, sub1Pos, sub2Pos - sub1Pos);
        double partialResult = matchToDelimiter(workText, start, tempBaseValue,
            temp, pp, sub1,
            nonNumericalExecutedRuleMask,
            recursionCount,
            upperBound);

        // if we got a successful match (or were trying to match a
        // null substitution), pp is now pointing at the first unmatched
        // character.  Take note of that, and try matchToDelimiter()
        // on the input text again
        if (pp.getIndex() != 0 || sub1 == nullptr) {
            start = pp.getIndex();

            UnicodeString workText2;
            workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex());
            ParsePosition pp2;

            // the second matchToDelimiter() will compose our previous
            // partial result with whatever it gets back from its
            // substitution if there's a successful match, giving us
            // a real result
            // (rule text after the second substitution)
            temp.setTo(fRuleText, sub2Pos, fRuleText.length() - sub2Pos);
            partialResult = matchToDelimiter(workText2, 0, partialResult,
                temp, pp2, sub2,
                nonNumericalExecutedRuleMask,
                recursionCount,
                upperBound);

            // if we got a successful match on this second
            // matchToDelimiter() call, update the high-water mark
            // and result (if necessary)
            if (pp2.getIndex() != 0 || sub2 == nullptr) {
                if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) {
                    highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex();
                    result = partialResult;
                }
            }
            else {
                // commented out because ParsePosition doesn't have error index in 1.1.x
                // restored for ICU4C port
                // (only ever moves the caller's error index forward)
                int32_t i_temp = pp2.getErrorIndex() + sub1Pos + pp.getIndex();
                if (i_temp> parsePosition.getErrorIndex()) {
                    parsePosition.setErrorIndex(i_temp);
                }
            }
        }
        else {
            // commented out because ParsePosition doesn't have error index in 1.1.x
            // restored for ICU4C port
            // (only ever moves the caller's error index forward)
            int32_t i_temp = sub1Pos + pp.getErrorIndex();
            if (i_temp > parsePosition.getErrorIndex()) {
                parsePosition.setErrorIndex(i_temp);
            }
        }
        // keep trying to match things until the outer matchToDelimiter()
        // call fails to make a match (each time, it picks up where it
        // left off the previous time)
        // NOTE(review): on every path where pp.getIndex() > 0 the body
        // above has just assigned start = pp.getIndex(), so the final
        // clause below looks like it always ends the loop after a single
        // pass -- confirm whether that is the intended termination guard.
    } while (sub1Pos != sub2Pos
        && pp.getIndex() > 0
        && pp.getIndex() < workText.length()
        && pp.getIndex() != start);

    // update the caller's ParsePosition with our high-water mark
    // (i.e., it now points at the first character this function
    // didn't match-- the ParsePosition is therefore unchanged if
    // we didn't match anything)
    parsePosition.setIndex(highWaterMark);
    // commented out because ParsePosition doesn't have error index in 1.1.x
    // restored for ICU4C port
    // (a successful match clears any error index recorded above)
    if (highWaterMark > 0) {
        parsePosition.setErrorIndex(0);
    }

    // this is a hack for one unusual condition: Normally, whether this
    // rule belong to a fraction rule set or not is handled by its
    // substitutions.  But if that rule HAS NO substitutions, then
    // we have to account for it here.  By definition, if the matching
    // rule in a fraction rule set has no substitutions, its numerator
    // is 1, and so the result is the reciprocal of its base value.
    if (isFractionRule && highWaterMark > 0 && sub1 == nullptr) {
        result = 1 / result;
    }

    resVal.setDouble(result);
    return true; // ??? do we need to worry if it is a long or a double?
}

/**
 * This function is used by parse() to match the text being parsed
 * against a possible prefix string.  This function
 * matches characters from the beginning of the string being parsed
 * to characters from the prospective prefix.  If they match, pp is
 * updated to the first character not matched, and the result is
 * the unparsed part of the string.  If they don't match, the whole
 * string is returned, and pp is left unchanged.
 * @param text The string being parsed
 * @param prefix The text to match against
 * @param pp On entry, ignored and assumed to be 0.  On exit, points
 * to the first unmatched character (assuming the whole prefix matched),
 * or is unchanged (if the whole prefix didn't match).
1156 * @return If things match, this is the unparsed part of "text"; 1157 * if they didn't match, this is "text". 1158 */ 1159 void 1160 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const 1161 { 1162 // if the prefix text is empty, dump out without doing anything 1163 if (prefix.length() != 0) { 1164 UErrorCode status = U_ZERO_ERROR; 1165 // use prefixLength() to match the beginning of 1166 // "text" against "prefix". This function returns the 1167 // number of characters from "text" that matched (or 0 if 1168 // we didn't match the whole prefix) 1169 int32_t pfl = prefixLength(text, prefix, status); 1170 if (U_FAILURE(status)) { // Memory allocation error. 1171 return; 1172 } 1173 if (pfl != 0) { 1174 // if we got a successful match, update the parse position 1175 // and strip the prefix off of "text" 1176 pp.setIndex(pp.getIndex() + pfl); 1177 text.remove(0, pfl); 1178 } 1179 } 1180 } 1181 1182 /** 1183 * Used by parse() to match a substitution and any following text. 1184 * "text" is searched for instances of "delimiter". For each instance 1185 * of delimiter, the intervening text is tested to see whether it 1186 * matches the substitution. The longest match wins. 1187 * @param text The string being parsed 1188 * @param startPos The position in "text" where we should start looking 1189 * for "delimiter". 1190 * @param baseValue A partial parse result (often the rule's base value), 1191 * which is combined with the result from matching the substitution 1192 * @param delimiter The string to search "text" for. 1193 * @param pp Ignored and presumed to be 0 on entry. If there's a match, 1194 * on exit this will point to the first unmatched character. 1195 * @param sub If we find "delimiter" in "text", this substitution is used 1196 * to match the text between the beginning of the string and the 1197 * position of "delimiter." 
(If "delimiter" is the empty string, then
 * this function just matches against this substitution and updates
 * everything accordingly.)
 * @param nonNumericalExecutedRuleMask Passed through unchanged to
 * sub->doParse().
 * @param recursionCount Passed through unchanged to sub->doParse().
 * @param upperBound When matching the substitution, it will only
 * consider rules with base values lower than this value.
 * @return If there's a match, this is the result of composing
 * baseValue with the result of matching the substitution.  Otherwise,
 * this is 0.  (If "delimiter" is semantically empty and "sub" is null,
 * baseValue is returned unchanged.)
 *
 * !!! note {dlf} in point of fact, in the java code the caller always converts
 * the result to a double, so we might as well return one.
 */
double
NFRule::matchToDelimiter(const UnicodeString& text,
                         int32_t startPos,
                         double _baseValue,
                         const UnicodeString& delimiter,
                         ParsePosition& pp,
                         const NFSubstitution* sub,
                         uint32_t nonNumericalExecutedRuleMask,
                         int32_t recursionCount,
                         double upperBound) const
{
    UErrorCode status = U_ZERO_ERROR;
    // if "delimiter" contains real (i.e., non-ignorable) text, search
    // "text" for "delimiter" beginning at "startPos".  If that succeeds,
    // then use "sub"'s doParse() method to match the text before the
    // instance of "delimiter" we just found.
    if (!allIgnorable(delimiter, status)) {
        // allIgnorable() returns false both for "not ignorable" and for
        // failure, so the status has to be checked here
        if (U_FAILURE(status)) { //Memory allocation error.
            return 0;
        }
        ParsePosition tempPP;
        Formattable result;

        // use findText() to search for "delimiter".  It returns the
        // position of the match (or -1) and stores in dLen the number
        // of characters of "text" that matched "delimiter".
        int32_t dLen;
        int32_t dPos = findText(text, delimiter, startPos, &dLen);

        // if findText() succeeded, isolate the text preceding the
        // match, and use "sub" to match that text
        while (dPos >= 0) {
            UnicodeString subText;
            subText.setTo(text, 0, dPos);
            if (subText.length() > 0) {
                // NOTE(review): "sub" is dereferenced without a null check
                // in this branch; callers presumably only pass a
                // non-ignorable delimiter together with a non-null
                // substitution -- confirm.
                UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound,
#if UCONFIG_NO_COLLATION
                    false,
#else
                    formatter->isLenient(),
#endif
                    nonNumericalExecutedRuleMask,
                    recursionCount,
                    result);

                // if the substitution could match all the text up to
                // where we found "delimiter", then this function has
                // a successful match.  Bump the caller's parse position
                // to point to the first character after the text
                // that matches "delimiter", and return the result
                // we got from parsing the substitution.
                if (success && tempPP.getIndex() == dPos) {
                    pp.setIndex(dPos + dLen);
                    return result.getDouble();
                }
                else {
                    // commented out because ParsePosition doesn't have error index in 1.1.x
                    // restored for ICU4C port
                    // (prefer the substitution's own error index; fall
                    // back to how far it got)
                    if (tempPP.getErrorIndex() > 0) {
                        pp.setErrorIndex(tempPP.getErrorIndex());
                    } else {
                        pp.setErrorIndex(tempPP.getIndex());
                    }
                }
            }

            // if we didn't match the substitution, search for another
            // copy of "delimiter" in "text" and repeat the loop if
            // we find it
            tempPP.setIndex(0);
            dPos = findText(text, delimiter, dPos + dLen, &dLen);
        }
        // if we make it here, this was an unsuccessful match: reset
        // pp's index to 0 and return 0
        pp.setIndex(0);
        return 0;
    }
    // if "delimiter" is empty, or consists only of ignorable characters
    // (i.e., is semantically empty), then we obviously can't search
    // for "delimiter".  Instead, just use "sub" to parse as much of
    // "text" as possible.  With no substitution at all there is nothing
    // to parse, so the partial result is handed back untouched.
    else if (sub == nullptr) {
        return _baseValue;
    }
    else {
        ParsePosition tempPP;
        Formattable result;

        // try to match the whole string against the substitution
        UBool success = sub->doParse(text, tempPP, _baseValue, upperBound,
#if UCONFIG_NO_COLLATION
            false,
#else
            formatter->isLenient(),
#endif
            nonNumericalExecutedRuleMask,
            recursionCount,
            result);
        if (success && (tempPP.getIndex() != 0)) {
            // if there's a successful match, update pp to point to the
            // first character we didn't match, and pass the result from
            // sub.doParse() on through to the caller
            pp.setIndex(tempPP.getIndex());
            return result.getDouble();
        }
        else {
            // commented out because ParsePosition doesn't have error index in 1.1.x
            // restored for ICU4C port
            pp.setErrorIndex(tempPP.getErrorIndex());
        }

        // and if we get to here, then nothing matched, so we return
        // 0 and leave pp's index alone
        return 0;
    }
}

/**
 * Used by stripPrefix() to match characters.  If lenient parse mode
 * is off, this just calls startsWith().  If lenient parse mode is on,
 * this function uses CollationElementIterators to match characters in
 * the strings (only primary-order differences are significant in
 * determining whether there's a match).
 * @param str The string being tested
 * @param prefix The text we're hoping to see at the beginning
 * of "str"
 * @return If "prefix" is found at the beginning of "str", this
 * is the number of characters in "str" that were matched (this
 * isn't necessarily the same as the length of "prefix" when matching
 * text with a collator).  If there's no match, this is 0.
1344 */ 1345 int32_t 1346 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const 1347 { 1348 // if we're looking for an empty prefix, it obviously matches 1349 // zero characters. Just go ahead and return 0. 1350 if (prefix.length() == 0) { 1351 return 0; 1352 } 1353 1354 #if !UCONFIG_NO_COLLATION 1355 // go through all this grief if we're in lenient-parse mode 1356 if (formatter->isLenient()) { 1357 // Check if non-lenient rule finds the text before call lenient parsing 1358 if (str.startsWith(prefix)) { 1359 return prefix.length(); 1360 } 1361 // get the formatter's collator and use it to create two 1362 // collation element iterators, one over the target string 1363 // and another over the prefix (right now, we'll throw an 1364 // exception if the collator we get back from the formatter 1365 // isn't a RuleBasedCollator, because RuleBasedCollator defines 1366 // the CollationElementIterator protocol. Hopefully, this 1367 // will change someday.) 1368 const RuleBasedCollator* collator = formatter->getCollator(); 1369 if (collator == nullptr) { 1370 status = U_MEMORY_ALLOCATION_ERROR; 1371 return 0; 1372 } 1373 LocalPointer<CollationElementIterator> strIter(collator->createCollationElementIterator(str)); 1374 LocalPointer<CollationElementIterator> prefixIter(collator->createCollationElementIterator(prefix)); 1375 // Check for memory allocation error. 1376 if (strIter.isNull() || prefixIter.isNull()) { 1377 status = U_MEMORY_ALLOCATION_ERROR; 1378 return 0; 1379 } 1380 1381 UErrorCode err = U_ZERO_ERROR; 1382 1383 // The original code was problematic. Consider this match: 1384 // prefix = "fifty-" 1385 // string = " fifty-7" 1386 // The intent is to match string up to the '7', by matching 'fifty-' at position 1 1387 // in the string. Unfortunately, we were getting a match, and then computing where 1388 // the match terminated by rematching the string. 
The rematch code was using as an 1389 // initial guess the substring of string between 0 and prefix.length. Because of 1390 // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving 1391 // the position before the hyphen in the string. Recursing down, we then parsed the 1392 // remaining string '-7' as numeric. The resulting number turned out as 43 (50 - 7). 1393 // This was not pretty, especially since the string "fifty-7" parsed just fine. 1394 // 1395 // We have newer APIs now, so we can use calls on the iterator to determine what we 1396 // matched up to. If we terminate because we hit the last element in the string, 1397 // our match terminates at this length. If we terminate because we hit the last element 1398 // in the target, our match terminates at one before the element iterator position. 1399 1400 // match collation elements between the strings 1401 int32_t oStr = strIter->next(err); 1402 int32_t oPrefix = prefixIter->next(err); 1403 1404 while (oPrefix != CollationElementIterator::NULLORDER) { 1405 // skip over ignorable characters in the target string 1406 while (CollationElementIterator::primaryOrder(oStr) == 0 1407 && oStr != CollationElementIterator::NULLORDER) { 1408 oStr = strIter->next(err); 1409 } 1410 1411 // skip over ignorable characters in the prefix 1412 while (CollationElementIterator::primaryOrder(oPrefix) == 0 1413 && oPrefix != CollationElementIterator::NULLORDER) { 1414 oPrefix = prefixIter->next(err); 1415 } 1416 1417 // dlf: move this above following test, if we consume the 1418 // entire target, aren't we ok even if the source was also 1419 // entirely consumed? 
1420 1421 // if skipping over ignorables brought to the end of 1422 // the prefix, we DID match: drop out of the loop 1423 if (oPrefix == CollationElementIterator::NULLORDER) { 1424 break; 1425 } 1426 1427 // if skipping over ignorables brought us to the end 1428 // of the target string, we didn't match and return 0 1429 if (oStr == CollationElementIterator::NULLORDER) { 1430 return 0; 1431 } 1432 1433 // match collation elements from the two strings 1434 // (considering only primary differences). If we 1435 // get a mismatch, dump out and return 0 1436 if (CollationElementIterator::primaryOrder(oStr) 1437 != CollationElementIterator::primaryOrder(oPrefix)) { 1438 return 0; 1439 1440 // otherwise, advance to the next character in each string 1441 // and loop (we drop out of the loop when we exhaust 1442 // collation elements in the prefix) 1443 } else { 1444 oStr = strIter->next(err); 1445 oPrefix = prefixIter->next(err); 1446 } 1447 } 1448 1449 int32_t result = strIter->getOffset(); 1450 if (oStr != CollationElementIterator::NULLORDER) { 1451 --result; // back over character that we don't want to consume; 1452 } 1453 1454 #ifdef RBNF_DEBUG 1455 fprintf(stderr, "prefix length: %d\n", result); 1456 #endif 1457 return result; 1458 #if 0 1459 //---------------------------------------------------------------- 1460 // JDK 1.2-specific API call 1461 // return strIter.getOffset(); 1462 //---------------------------------------------------------------- 1463 // JDK 1.1 HACK (take out for 1.2-specific code) 1464 1465 // if we make it to here, we have a successful match. Now we 1466 // have to find out HOW MANY characters from the target string 1467 // matched the prefix (there isn't necessarily a one-to-one 1468 // mapping between collation elements and characters). 1469 // In JDK 1.2, there's a simple getOffset() call we can use. 1470 // In JDK 1.1, on the other hand, we have to go through some 1471 // ugly contortions. 
First, use the collator to compare the 1472 // same number of characters from the prefix and target string. 1473 // If they're equal, we're done. 1474 collator->setStrength(Collator::PRIMARY); 1475 if (str.length() >= prefix.length()) { 1476 UnicodeString temp; 1477 temp.setTo(str, 0, prefix.length()); 1478 if (collator->equals(temp, prefix)) { 1479 #ifdef RBNF_DEBUG 1480 fprintf(stderr, "returning: %d\n", prefix.length()); 1481 #endif 1482 return prefix.length(); 1483 } 1484 } 1485 1486 // if they're not equal, then we have to compare successively 1487 // larger and larger substrings of the target string until we 1488 // get to one that matches the prefix. At that point, we know 1489 // how many characters matched the prefix, and we can return. 1490 int32_t p = 1; 1491 while (p <= str.length()) { 1492 UnicodeString temp; 1493 temp.setTo(str, 0, p); 1494 if (collator->equals(temp, prefix)) { 1495 return p; 1496 } else { 1497 ++p; 1498 } 1499 } 1500 1501 // SHOULD NEVER GET HERE!!! 1502 return 0; 1503 //---------------------------------------------------------------- 1504 #endif 1505 1506 // If lenient parsing is turned off, forget all that crap above. 1507 // Just use String.startsWith() and be done with it. 1508 } else 1509 #endif 1510 { 1511 if (str.startsWith(prefix)) { 1512 return prefix.length(); 1513 } else { 1514 return 0; 1515 } 1516 } 1517 } 1518 1519 /** 1520 * Searches a string for another string. If lenient parsing is off, 1521 * this just calls indexOf(). If lenient parsing is on, this function 1522 * uses CollationElementIterator to match characters, and only 1523 * primary-order differences are significant in determining whether 1524 * there's a match. 1525 * @param str The string to search 1526 * @param key The string to search "str" for 1527 * @param startingAt The index into "str" where the search is to 1528 * begin 1529 * @return A two-element array of ints. Element 0 is the position 1530 * of the match, or -1 if there was no match. 
Element 1 is the 1531 * number of characters in "str" that matched (which isn't necessarily 1532 * the same as the length of "key") 1533 */ 1534 int32_t 1535 NFRule::findText(const UnicodeString& str, 1536 const UnicodeString& key, 1537 int32_t startingAt, 1538 int32_t* length) const 1539 { 1540 if (rulePatternFormat) { 1541 Formattable result; 1542 FieldPosition position(UNUM_INTEGER_FIELD); 1543 position.setBeginIndex(startingAt); 1544 rulePatternFormat->parseType(str, this, result, position); 1545 int start = position.getBeginIndex(); 1546 if (start >= 0) { 1547 int32_t pluralRuleStart = fRuleText.indexOf(gDollarOpenParenthesis, -1, 0); 1548 int32_t pluralRuleSuffix = fRuleText.indexOf(gClosedParenthesisDollar, -1, pluralRuleStart) + 2; 1549 int32_t matchLen = position.getEndIndex() - start; 1550 UnicodeString prefix(fRuleText.tempSubString(0, pluralRuleStart)); 1551 UnicodeString suffix(fRuleText.tempSubString(pluralRuleSuffix)); 1552 if (str.compare(start - prefix.length(), prefix.length(), prefix, 0, prefix.length()) == 0 1553 && str.compare(start + matchLen, suffix.length(), suffix, 0, suffix.length()) == 0) 1554 { 1555 *length = matchLen + prefix.length() + suffix.length(); 1556 return start - prefix.length(); 1557 } 1558 } 1559 *length = 0; 1560 return -1; 1561 } 1562 if (!formatter->isLenient()) { 1563 // if lenient parsing is turned off, this is easy: just call 1564 // String.indexOf() and we're done 1565 *length = key.length(); 1566 return str.indexOf(key, startingAt); 1567 } 1568 else { 1569 // Check if non-lenient rule finds the text before call lenient parsing 1570 *length = key.length(); 1571 int32_t pos = str.indexOf(key, startingAt); 1572 if(pos >= 0) { 1573 return pos; 1574 } else { 1575 // but if lenient parsing is turned ON, we've got some work ahead of us 1576 return findTextLenient(str, key, startingAt, length); 1577 } 1578 } 1579 } 1580 1581 int32_t 1582 NFRule::findTextLenient(const UnicodeString& str, 1583 const UnicodeString& key, 1584 
int32_t startingAt, 1585 int32_t* length) const 1586 { 1587 //---------------------------------------------------------------- 1588 // JDK 1.1 HACK (take out of 1.2-specific code) 1589 1590 // in JDK 1.2, CollationElementIterator provides us with an 1591 // API to map between character offsets and collation elements 1592 // and we can do this by marching through the string comparing 1593 // collation elements. We can't do that in JDK 1.1. Instead, 1594 // we have to go through this horrible slow mess: 1595 int32_t p = startingAt; 1596 int32_t keyLen = 0; 1597 1598 // basically just isolate smaller and smaller substrings of 1599 // the target string (each running to the end of the string, 1600 // and with the first one running from startingAt to the end) 1601 // and then use prefixLength() to see if the search key is at 1602 // the beginning of each substring. This is excruciatingly 1603 // slow, but it will locate the key and tell use how long the 1604 // matching text was. 1605 UnicodeString temp; 1606 UErrorCode status = U_ZERO_ERROR; 1607 while (p < str.length() && keyLen == 0) { 1608 temp.setTo(str, p, str.length() - p); 1609 keyLen = prefixLength(temp, key, status); 1610 if (U_FAILURE(status)) { 1611 break; 1612 } 1613 if (keyLen != 0) { 1614 *length = keyLen; 1615 return p; 1616 } 1617 ++p; 1618 } 1619 // if we make it to here, we didn't find it. Return -1 for the 1620 // location. The length should be ignored, but set it to 0, 1621 // which should be "safe" 1622 *length = 0; 1623 return -1; 1624 } 1625 1626 /** 1627 * Checks to see whether a string consists entirely of ignorable 1628 * characters. 1629 * @param str The string to test. 1630 * @return true if the string is empty of consists entirely of 1631 * characters that the number formatter's collator says are 1632 * ignorable at the primary-order level. false otherwise. 
1633 */ 1634 UBool 1635 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const 1636 { 1637 // if the string is empty, we can just return true 1638 if (str.length() == 0) { 1639 return true; 1640 } 1641 1642 #if !UCONFIG_NO_COLLATION 1643 // if lenient parsing is turned on, walk through the string with 1644 // a collation element iterator and make sure each collation 1645 // element is 0 (ignorable) at the primary level 1646 if (formatter->isLenient()) { 1647 const RuleBasedCollator* collator = formatter->getCollator(); 1648 if (collator == nullptr) { 1649 status = U_MEMORY_ALLOCATION_ERROR; 1650 return false; 1651 } 1652 LocalPointer<CollationElementIterator> iter(collator->createCollationElementIterator(str)); 1653 1654 // Memory allocation error check. 1655 if (iter.isNull()) { 1656 status = U_MEMORY_ALLOCATION_ERROR; 1657 return false; 1658 } 1659 1660 UErrorCode err = U_ZERO_ERROR; 1661 int32_t o = iter->next(err); 1662 while (o != CollationElementIterator::NULLORDER 1663 && CollationElementIterator::primaryOrder(o) == 0) { 1664 o = iter->next(err); 1665 } 1666 1667 return o == CollationElementIterator::NULLORDER; 1668 } 1669 #endif 1670 1671 // if lenient parsing is turned off, there is no such thing as 1672 // an ignorable character: return true only if the string is empty 1673 return false; 1674 } 1675 1676 void 1677 NFRule::setDecimalFormatSymbols(const DecimalFormatSymbols& newSymbols, UErrorCode& status) { 1678 if (sub1 != nullptr) { 1679 sub1->setDecimalFormatSymbols(newSymbols, status); 1680 } 1681 if (sub2 != nullptr) { 1682 sub2->setDecimalFormatSymbols(newSymbols, status); 1683 } 1684 } 1685 1686 U_NAMESPACE_END 1687 1688 /* U_HAVE_RBNF */ 1689 #endif