number_affixutils.cpp (15622B)
1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include "unicode/utypes.h" 5 6 #if !UCONFIG_NO_FORMATTING 7 8 #include "number_affixutils.h" 9 #include "unicode/utf16.h" 10 #include "unicode/uniset.h" 11 12 using namespace icu; 13 using namespace icu::number; 14 using namespace icu::number::impl; 15 16 TokenConsumer::~TokenConsumer() = default; 17 SymbolProvider::~SymbolProvider() = default; 18 19 int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) { 20 AffixPatternState state = STATE_BASE; 21 int32_t offset = 0; 22 int32_t length = 0; 23 for (; offset < patternString.length();) { 24 UChar32 cp = patternString.char32At(offset); 25 26 switch (state) { 27 case STATE_BASE: 28 if (cp == u'\'') { 29 // First quote 30 state = STATE_FIRST_QUOTE; 31 } else { 32 // Unquoted symbol 33 length++; 34 } 35 break; 36 case STATE_FIRST_QUOTE: 37 if (cp == u'\'') { 38 // Repeated quote 39 length++; 40 state = STATE_BASE; 41 } else { 42 // Quoted code point 43 length++; 44 state = STATE_INSIDE_QUOTE; 45 } 46 break; 47 case STATE_INSIDE_QUOTE: 48 if (cp == u'\'') { 49 // End of quoted sequence 50 state = STATE_AFTER_QUOTE; 51 } else { 52 // Quoted code point 53 length++; 54 } 55 break; 56 case STATE_AFTER_QUOTE: 57 if (cp == u'\'') { 58 // Double quote inside of quoted sequence 59 length++; 60 state = STATE_INSIDE_QUOTE; 61 } else { 62 // Unquoted symbol 63 length++; 64 } 65 break; 66 default: 67 UPRV_UNREACHABLE_EXIT; 68 } 69 70 offset += U16_LENGTH(cp); 71 } 72 73 switch (state) { 74 case STATE_FIRST_QUOTE: 75 case STATE_INSIDE_QUOTE: 76 status = U_ILLEGAL_ARGUMENT_ERROR; 77 break; 78 default: 79 break; 80 } 81 82 return length; 83 } 84 85 UnicodeString AffixUtils::escape(const UnicodeString &input) { 86 AffixPatternState state = STATE_BASE; 87 int32_t offset = 0; 88 UnicodeString output; 89 for (; offset < input.length();) { 90 UChar32 cp = input.char32At(offset); 91 92 switch (cp) { 93 case u'\'': 94 output.append(u"''", -1); 95 break; 96 97 case u'-': 98 case u'+': 99 case u'%': 100 case u'‰': 101 case u'¤': 102 if (state == STATE_BASE) { 103 output.append(u'\''); 104 output.append(cp); 105 state = STATE_INSIDE_QUOTE; 106 } else { 107 output.append(cp); 108 } 109 break; 110 111 default: 112 if (state == STATE_INSIDE_QUOTE) { 113 output.append(u'\''); 114 output.append(cp); 115 state = STATE_BASE; 116 } else { 117 output.append(cp); 118 } 119 break; 120 } 121 offset += U16_LENGTH(cp); 122 } 123 124 if (state == STATE_INSIDE_QUOTE) { 125 output.append(u'\''); 126 } 127 128 return output; 129 } 130 131 Field AffixUtils::getFieldForType(AffixPatternType type) { 132 switch (type) { 133 case TYPE_MINUS_SIGN: 134 return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD}; 135 case TYPE_PLUS_SIGN: 136 return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD}; 137 case TYPE_APPROXIMATELY_SIGN: 138 return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD}; 139 case TYPE_PERCENT: 140 return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD}; 141 case TYPE_PERMILLE: 142 return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD}; 143 case TYPE_CURRENCY_SINGLE: 144 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; 145 case TYPE_CURRENCY_DOUBLE: 146 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; 147 case TYPE_CURRENCY_TRIPLE: 148 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; 149 case TYPE_CURRENCY_QUAD: 150 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; 151 case TYPE_CURRENCY_QUINT: 152 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; 153 case TYPE_CURRENCY_OVERFLOW: 154 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; 155 default: 156 UPRV_UNREACHABLE_EXIT; 157 } 158 } 159 160 int32_t 161 AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position, 162 const SymbolProvider &provider, Field field, UErrorCode &status) { 163 int32_t length = 0; 164 AffixTag tag; 165 while (hasNext(tag, affixPattern)) { 166 tag = nextToken(tag, affixPattern, status); 167 if (U_FAILURE(status)) { return length; } 168 if (tag.type == TYPE_CURRENCY_OVERFLOW) { 169 // Don't go to the provider for this special case 170 length += output.insertCodePoint( 171 position + length, 172 0xFFFD, 173 {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, 174 status); 175 } else if (tag.type < 0) { 176 length += output.insert( 177 position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status); 178 } else { 179 length += output.insertCodePoint(position + length, tag.codePoint, field, status); 180 } 181 } 182 return length; 183 } 184 185 int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern, 186 const SymbolProvider &provider, UErrorCode &status) { 187 int32_t length = 0; 188 AffixTag tag; 189 while (hasNext(tag, affixPattern)) { 190 tag = nextToken(tag, affixPattern, status); 191 if (U_FAILURE(status)) { return length; } 192 if (tag.type == TYPE_CURRENCY_OVERFLOW) { 193 length += 1; 194 } else if (tag.type < 0) { 195 length += provider.getSymbol(tag.type).length(); 196 } else { 197 length += U16_LENGTH(tag.codePoint); 198 } 199 } 200 return length; 201 } 202 203 bool 204 AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) { 205 if (affixPattern.length() == 0) { 206 return false; 207 } 208 AffixTag tag; 209 while (hasNext(tag, affixPattern)) { 210 tag = nextToken(tag, affixPattern, status); 211 if (U_FAILURE(status)) { return false; } 212 if (tag.type == type) { 213 return true; 214 } 215 } 216 return false; 217 } 218 219 bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) { 220 if (affixPattern.length() == 0) { 221 return false; 222 } 223 AffixTag tag; 224 while (hasNext(tag, affixPattern)) { 225 tag = nextToken(tag, affixPattern, status); 226 if (U_FAILURE(status)) { return false; } 227 if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) { 228 return true; 229 } 230 } 231 return false; 232 } 233 234 UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type, 235 char16_t replacementChar, UErrorCode &status) { 236 UnicodeString output(affixPattern); // copy 237 if (affixPattern.length() == 0) { 238 return output; 239 } 240 AffixTag tag; 241 while (hasNext(tag, affixPattern)) { 242 tag = nextToken(tag, affixPattern, status); 243 if (U_FAILURE(status)) { return output; } 244 if (tag.type == type) { 245 output.replace(tag.offset - 1, 1, replacementChar); 246 } 247 } 248 return output; 249 } 250 251 bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern, 252 const UnicodeSet& ignorables, UErrorCode& status) { 253 if (affixPattern.length() == 0) { 254 return true; 255 } 256 AffixTag tag; 257 while (hasNext(tag, affixPattern)) { 258 tag = nextToken(tag, affixPattern, status); 259 if (U_FAILURE(status)) { return false; } 260 if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) { 261 return false; 262 } 263 } 264 return true; 265 } 266 267 void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer, 268 UErrorCode& status) { 269 if (affixPattern.length() == 0) { 270 return; 271 } 272 AffixTag tag; 273 while (hasNext(tag, affixPattern)) { 274 tag = nextToken(tag, affixPattern, status); 275 if (U_FAILURE(status)) { return; } 276 consumer.consumeToken(tag.type, tag.codePoint, status); 277 if (U_FAILURE(status)) { return; } 278 } 279 } 280 281 AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) { 282 int32_t offset = tag.offset; 283 int32_t state = tag.state; 284 for (; offset < patternString.length();) { 285 UChar32 cp = patternString.char32At(offset); 286 int32_t count = U16_LENGTH(cp); 287 288 switch (state) { 289 case STATE_BASE: 290 switch (cp) { 291 case u'\'': 292 state = STATE_FIRST_QUOTE; 293 offset += count; 294 // continue to the next code point 295 break; 296 case u'-': 297 return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0); 298 case u'+': 299 return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0); 300 case u'~': 301 return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0); 302 case u'%': 303 return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0); 304 case u'‰': 305 return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0); 306 case u'¤': 307 state = STATE_FIRST_CURR; 308 offset += count; 309 // continue to the next code point 310 break; 311 default: 312 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); 313 } 314 break; 315 case STATE_FIRST_QUOTE: 316 if (cp == u'\'') { 317 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); 318 } else { 319 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); 320 } 321 case STATE_INSIDE_QUOTE: 322 if (cp == u'\'') { 323 state = STATE_AFTER_QUOTE; 324 offset += count; 325 // continue to the next code point 326 break; 327 } else { 328 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); 329 } 330 case STATE_AFTER_QUOTE: 331 if (cp == u'\'') { 332 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); 333 } else { 334 state = STATE_BASE; 335 // re-evaluate this code point 336 break; 337 } 338 case STATE_FIRST_CURR: 339 if (cp == u'¤') { 340 state = STATE_SECOND_CURR; 341 offset += count; 342 // continue to the next code point 343 break; 344 } else { 345 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); 346 } 347 case STATE_SECOND_CURR: 348 if (cp == u'¤') { 349 state = STATE_THIRD_CURR; 350 offset += count; 351 // continue to the next code point 352 break; 353 } else { 354 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); 355 } 356 case STATE_THIRD_CURR: 357 if (cp == u'¤') { 358 state = STATE_FOURTH_CURR; 359 offset += count; 360 // continue to the next code point 361 break; 362 } else { 363 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); 364 } 365 case STATE_FOURTH_CURR: 366 if (cp == u'¤') { 367 state = STATE_FIFTH_CURR; 368 offset += count; 369 // continue to the next code point 370 break; 371 } else { 372 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); 373 } 374 case STATE_FIFTH_CURR: 375 if (cp == u'¤') { 376 state = STATE_OVERFLOW_CURR; 377 offset += count; 378 // continue to the next code point 379 break; 380 } else { 381 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); 382 } 383 case STATE_OVERFLOW_CURR: 384 if (cp == u'¤') { 385 offset += count; 386 // continue to the next code point and loop back to this state 387 break; 388 } else { 389 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); 390 } 391 default: 392 UPRV_UNREACHABLE_EXIT; 393 } 394 } 395 // End of string 396 switch (state) { 397 case STATE_BASE: 398 // No more tokens in string. 399 return {-1}; 400 case STATE_FIRST_QUOTE: 401 case STATE_INSIDE_QUOTE: 402 // For consistent behavior with the JDK and ICU 58, set an error here. 403 status = U_ILLEGAL_ARGUMENT_ERROR; 404 return {-1}; 405 case STATE_AFTER_QUOTE: 406 // No more tokens in string. 407 return {-1}; 408 case STATE_FIRST_CURR: 409 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); 410 case STATE_SECOND_CURR: 411 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); 412 case STATE_THIRD_CURR: 413 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); 414 case STATE_FOURTH_CURR: 415 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); 416 case STATE_FIFTH_CURR: 417 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); 418 case STATE_OVERFLOW_CURR: 419 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); 420 default: 421 UPRV_UNREACHABLE_EXIT; 422 } 423 } 424 425 bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) { 426 // First check for the {-1} and default initializer syntax. 427 if (tag.offset < 0) { 428 return false; 429 } else if (tag.offset == 0) { 430 return string.length() > 0; 431 } 432 // The rest of the fields are safe to use now. 433 // Special case: the last character in string is an end quote. 434 if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 && 435 string.charAt(tag.offset) == u'\'') { 436 return false; 437 } else if (tag.state != STATE_BASE) { 438 return true; 439 } else { 440 return tag.offset < string.length(); 441 } 442 } 443 444 #endif /* #if !UCONFIG_NO_FORMATTING */