measunit_extra.cpp (62452B)
1 // © 2020 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // Extra functions for MeasureUnit not needed for all clients. 5 // Separate .o file so that it can be removed for modularity. 6 7 #include "unicode/utypes.h" 8 9 #if !UCONFIG_NO_FORMATTING 10 11 // Allow implicit conversion from char16_t* to UnicodeString for this file: 12 // Helpful in toString methods and elsewhere. 13 #define UNISTR_FROM_STRING_EXPLICIT 14 15 #include "charstr.h" 16 #include "cmemory.h" 17 #include "cstring.h" 18 #ifdef JS_HAS_INTL_API 19 #include "double-conversion/string-to-double.h" 20 #else 21 #include "double-conversion-string-to-double.h" 22 #endif 23 #include "measunit_impl.h" 24 #include "resource.h" 25 #include "uarrsort.h" 26 #include "uassert.h" 27 #include "ucln_in.h" 28 #include "umutex.h" 29 #include "unicode/bytestrie.h" 30 #include "unicode/bytestriebuilder.h" 31 #include "unicode/localpointer.h" 32 #include "unicode/stringpiece.h" 33 #include "unicode/stringtriebuilder.h" 34 #include "unicode/ures.h" 35 #include "unicode/ustringtrie.h" 36 #include "uresimp.h" 37 #include "util.h" 38 #include <limits.h> 39 #include <cstdlib> 40 U_NAMESPACE_BEGIN 41 42 43 namespace { 44 45 #ifdef JS_HAS_INTL_API 46 using double_conversion::StringToDoubleConverter; 47 #else 48 using icu::double_conversion::StringToDoubleConverter; 49 #endif 50 51 // TODO: Propose a new error code for this? 52 constexpr UErrorCode kUnitIdentifierSyntaxError = U_ILLEGAL_ARGUMENT_ERROR; 53 54 // Trie value offset for SI or binary prefixes. This is big enough to ensure we only 55 // insert positive integers into the trie. 56 constexpr int32_t kPrefixOffset = 64; 57 static_assert(kPrefixOffset + UMEASURE_PREFIX_INTERNAL_MIN_BIN > 0, 58 "kPrefixOffset is too small for minimum UMeasurePrefix value"); 59 static_assert(kPrefixOffset + UMEASURE_PREFIX_INTERNAL_MIN_SI > 0, 60 "kPrefixOffset is too small for minimum UMeasurePrefix value"); 61 62 // Trie value offset for compound parts, e.g. "-per-", "-", "-and-". 63 constexpr int32_t kCompoundPartOffset = 128; 64 static_assert(kCompoundPartOffset > kPrefixOffset + UMEASURE_PREFIX_INTERNAL_MAX_BIN, 65 "Ambiguous token values: prefix tokens are overlapping with CompoundPart tokens"); 66 static_assert(kCompoundPartOffset > kPrefixOffset + UMEASURE_PREFIX_INTERNAL_MAX_SI, 67 "Ambiguous token values: prefix tokens are overlapping with CompoundPart tokens"); 68 69 enum CompoundPart { 70 // Represents "-per-" 71 COMPOUND_PART_PER = kCompoundPartOffset, 72 // Represents "-" 73 COMPOUND_PART_TIMES, 74 // Represents "-and-" 75 COMPOUND_PART_AND, 76 }; 77 78 // Trie value offset for "per-". 79 constexpr int32_t kInitialCompoundPartOffset = 192; 80 81 enum InitialCompoundPart { 82 // Represents "per-", the only compound part that can appear at the start of 83 // an identifier. 84 INITIAL_COMPOUND_PART_PER = kInitialCompoundPartOffset, 85 }; 86 87 // Trie value offset for powers like "square-", "cubic-", "pow2-" etc. 88 constexpr int32_t kPowerPartOffset = 256; 89 90 enum PowerPart { 91 POWER_PART_P2 = kPowerPartOffset + 2, 92 POWER_PART_P3, 93 POWER_PART_P4, 94 POWER_PART_P5, 95 POWER_PART_P6, 96 POWER_PART_P7, 97 POWER_PART_P8, 98 POWER_PART_P9, 99 POWER_PART_P10, 100 POWER_PART_P11, 101 POWER_PART_P12, 102 POWER_PART_P13, 103 POWER_PART_P14, 104 POWER_PART_P15, 105 }; 106 107 // Trie value offset for simple units, e.g. "gram", "nautical-mile", 108 // "fluid-ounce-imperial". 109 constexpr int32_t kSimpleUnitOffset = 512; 110 111 // Trie value offset for aliases, e.g. "portion" replaced by "part" 112 constexpr int32_t kAliasOffset = 51200; // This will give a very big space for the units ids. 113 114 const struct UnitPrefixStrings { 115 const char* const string; 116 UMeasurePrefix value; 117 } gUnitPrefixStrings[] = { 118 // SI prefixes 119 { "quetta", UMEASURE_PREFIX_QUETTA }, 120 { "ronna", UMEASURE_PREFIX_RONNA }, 121 { "yotta", UMEASURE_PREFIX_YOTTA }, 122 { "zetta", UMEASURE_PREFIX_ZETTA }, 123 { "exa", UMEASURE_PREFIX_EXA }, 124 { "peta", UMEASURE_PREFIX_PETA }, 125 { "tera", UMEASURE_PREFIX_TERA }, 126 { "giga", UMEASURE_PREFIX_GIGA }, 127 { "mega", UMEASURE_PREFIX_MEGA }, 128 { "kilo", UMEASURE_PREFIX_KILO }, 129 { "hecto", UMEASURE_PREFIX_HECTO }, 130 { "deka", UMEASURE_PREFIX_DEKA }, 131 { "deci", UMEASURE_PREFIX_DECI }, 132 { "centi", UMEASURE_PREFIX_CENTI }, 133 { "milli", UMEASURE_PREFIX_MILLI }, 134 { "micro", UMEASURE_PREFIX_MICRO }, 135 { "nano", UMEASURE_PREFIX_NANO }, 136 { "pico", UMEASURE_PREFIX_PICO }, 137 { "femto", UMEASURE_PREFIX_FEMTO }, 138 { "atto", UMEASURE_PREFIX_ATTO }, 139 { "zepto", UMEASURE_PREFIX_ZEPTO }, 140 { "yocto", UMEASURE_PREFIX_YOCTO }, 141 { "ronto", UMEASURE_PREFIX_RONTO }, 142 { "quecto", UMEASURE_PREFIX_QUECTO }, 143 // Binary prefixes 144 { "yobi", UMEASURE_PREFIX_YOBI }, 145 { "zebi", UMEASURE_PREFIX_ZEBI }, 146 { "exbi", UMEASURE_PREFIX_EXBI }, 147 { "pebi", UMEASURE_PREFIX_PEBI }, 148 { "tebi", UMEASURE_PREFIX_TEBI }, 149 { "gibi", UMEASURE_PREFIX_GIBI }, 150 { "mebi", UMEASURE_PREFIX_MEBI }, 151 { "kibi", UMEASURE_PREFIX_KIBI }, 152 }; 153 154 /** 155 * A ResourceSink that collects simple unit identifiers from the keys of the 156 * convertUnits table into an array, and adds these values to a TrieBuilder, 157 * with associated values being their index into this array plus a specified 158 * offset. 159 * 160 * Example code: 161 * 162 * UErrorCode status = U_ZERO_ERROR; 163 * BytesTrieBuilder b(status); 164 * int32_t ARR_SIZE = 200; 165 * const char *unitIdentifiers[ARR_SIZE]; 166 * int32_t *unitCategories[ARR_SIZE]; 167 * SimpleUnitIdentifiersSink identifierSink(gSerializedUnitCategoriesTrie, unitIdentifiers, 168 * unitCategories, ARR_SIZE, b, kTrieValueOffset); 169 * LocalUResourceBundlePointer unitsBundle(ures_openDirect(nullptr, "units", &status)); 170 * ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status); 171 */ 172 class SimpleUnitIdentifiersSink : public icu::ResourceSink { 173 public: 174 /** 175 * Constructor. 176 * @param quantitiesTrieData The data for constructing a quantitiesTrie, 177 * which maps from a simple unit identifier to an index into the 178 * gCategories array. 179 * @param out Array of char* to which pointers to the simple unit 180 * identifiers will be saved. (Does not take ownership.) 181 * @param outCategories Array of int32_t to which category indexes will be 182 * saved: this corresponds to simple unit IDs saved to `out`, mapping 183 * from the ID to the value produced by the quantitiesTrie (which is an 184 * index into the gCategories array). 185 * @param outSize The size of `out` and `outCategories`. 186 * @param trieBuilder The trie builder to which the simple unit identifier 187 * should be added. The trie builder must outlive this resource sink. 188 * @param trieValueOffset This is added to the index of the identifier in 189 * the `out` array, before adding to `trieBuilder` as the value 190 * associated with the identifier. 191 */ 192 explicit SimpleUnitIdentifiersSink(StringPiece quantitiesTrieData, const char **out, 193 int32_t *outCategories, int32_t outSize, 194 BytesTrieBuilder &trieBuilder, int32_t trieValueOffset) 195 : outArray(out), outCategories(outCategories), outSize(outSize), trieBuilder(trieBuilder), 196 trieValueOffset(trieValueOffset), quantitiesTrieData(quantitiesTrieData), outIndex(0) {} 197 198 /** 199 * Adds the table keys found in value to the output vector. 200 * @param key The key of the resource passed to `value`: the second 201 * parameter of the ures_getAllItemsWithFallback() call. 202 * @param value Should be a ResourceTable value, if 203 * ures_getAllItemsWithFallback() was called correctly for this sink. 204 * @param noFallback Ignored. 205 * @param status The standard ICU error code output parameter. 206 */ 207 void put(const char * /*key*/, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) override { 208 ResourceTable table = value.getTable(status); 209 if (U_FAILURE(status)) return; 210 211 if (outIndex + table.getSize() > outSize) { 212 status = U_INDEX_OUTOFBOUNDS_ERROR; 213 return; 214 } 215 216 BytesTrie quantitiesTrie(quantitiesTrieData.data()); 217 218 // Collect keys from the table resource. 219 const char *simpleUnitID; 220 for (int32_t i = 0; table.getKeyAndValue(i, simpleUnitID, value); ++i) { 221 U_ASSERT(i < table.getSize()); 222 U_ASSERT(outIndex < outSize); 223 if (uprv_strcmp(simpleUnitID, "kilogram") == 0) { 224 // For parsing, we use "gram", the prefixless metric mass unit. We 225 // thus ignore the SI Base Unit of Mass: it exists due to being the 226 // mass conversion target unit, but not needed for MeasureUnit 227 // parsing. 228 continue; 229 } 230 outArray[outIndex] = simpleUnitID; 231 trieBuilder.add(simpleUnitID, trieValueOffset + outIndex, status); 232 233 // Find the base target unit for this simple unit 234 ResourceTable table = value.getTable(status); 235 if (U_FAILURE(status)) { return; } 236 if (!table.findValue("target", value)) { 237 status = U_INVALID_FORMAT_ERROR; 238 break; 239 } 240 int32_t len; 241 const char16_t* uTarget = value.getString(len, status); 242 CharString target; 243 target.appendInvariantChars(uTarget, len, status); 244 if (U_FAILURE(status)) { return; } 245 quantitiesTrie.reset(); 246 UStringTrieResult result = quantitiesTrie.next(target.data(), target.length()); 247 if (!USTRINGTRIE_HAS_VALUE(result)) { 248 status = U_INVALID_FORMAT_ERROR; 249 break; 250 } 251 outCategories[outIndex] = quantitiesTrie.getValue(); 252 253 outIndex++; 254 } 255 } 256 257 private: 258 const char **outArray; 259 int32_t *outCategories; 260 int32_t outSize; 261 BytesTrieBuilder &trieBuilder; 262 int32_t trieValueOffset; 263 264 StringPiece quantitiesTrieData; 265 266 int32_t outIndex; 267 }; 268 269 class UnitAliasesSink : public icu::ResourceSink { 270 public: 271 /** 272 * Constructor. 273 * @param unitAliases The output vector of unit alias identifiers (CharString). 274 * @param unitReplacements The output vector of replacements for the unit aliases (CharString). 275 */ 276 explicit UnitAliasesSink(MaybeStackVector<CharString> &unitAliases, 277 MaybeStackVector<CharString> &unitReplacements) 278 : unitAliases(unitAliases), unitReplacements(unitReplacements) {} 279 280 /** 281 * Adds the unit alias key and its replacement to the unitAliases and unitReplacements vectors. 282 * @param key The unit alias identifier (e.g., "meter-and-liter"). 283 * @param value Should be a ResourceTable value containing the replacement, 284 * when ures_getAllChildrenWithFallback() is called correctly for this sink. 285 * @param noFallback Ignored. 286 * @param status The standard ICU error code output parameter. 287 */ 288 void put(const char *key, ResourceValue &value, UBool /*noFallback*/, 289 UErrorCode &status) override { 290 if (U_FAILURE(status)) return; 291 292 // Add the unit alias key to the unitAliases vector 293 int32_t keyLen = static_cast<int32_t>(uprv_strlen(key)); 294 unitAliases.emplaceBackAndCheckErrorCode(status)->append(key, keyLen, status); 295 if (U_FAILURE(status)) { 296 return; 297 } 298 299 // Find the replacement for this unit alias from the alias table resource. 300 ResourceTable aliasTable = value.getTable(status); 301 if (U_FAILURE(status)) { 302 return; 303 } 304 305 if (!aliasTable.findValue("replacement", value)) { 306 status = U_MISSING_RESOURCE_ERROR; 307 return; 308 } 309 310 int32_t len; 311 const char16_t *uReplacement = value.getString(len, status); 312 unitReplacements.emplaceBackAndCheckErrorCode(status)->appendInvariantChars(uReplacement, 313 len, status); 314 } 315 316 private: 317 MaybeStackVector<CharString> &unitAliases; 318 MaybeStackVector<CharString> &unitReplacements; 319 }; 320 321 /** 322 * A ResourceSink that collects information from `unitQuantities` in the `units` 323 * resource to provide key->value lookups from base unit to category, as well as 324 * preserving ordering information for these categories. See `units.txt`. 325 * 326 * For example: "kilogram" -> "mass", "meter-per-second" -> "speed". 327 * 328 * In C++ unitQuantity values are collected in order into a char16_t* array, while 329 * unitQuantity keys are added added to a TrieBuilder, with associated values 330 * being the index into the aforementioned char16_t* array. 331 */ 332 class CategoriesSink : public icu::ResourceSink { 333 public: 334 /** 335 * Constructor. 336 * @param out Array of char16_t* to which unitQuantity values will be saved. 337 * The pointers returned not owned: they point directly at the resource 338 * strings in static memory. 339 * @param outSize The size of the `out` array. 340 * @param trieBuilder The trie builder to which the keys (base units) of 341 * each unitQuantity will be added, each with value being the offset 342 * into `out`. 343 */ 344 explicit CategoriesSink(const char16_t **out, int32_t &outSize, BytesTrieBuilder &trieBuilder) 345 : outQuantitiesArray(out), outSize(outSize), trieBuilder(trieBuilder), outIndex(0) {} 346 347 void put(const char * /*key*/, ResourceValue &value, UBool /*noFallback*/, UErrorCode &status) override { 348 ResourceArray array = value.getArray(status); 349 if (U_FAILURE(status)) { 350 return; 351 } 352 353 if (outIndex + array.getSize() > outSize) { 354 status = U_INDEX_OUTOFBOUNDS_ERROR; 355 return; 356 } 357 358 for (int32_t i = 0; array.getValue(i, value); ++i) { 359 U_ASSERT(outIndex < outSize); 360 ResourceTable table = value.getTable(status); 361 if (U_FAILURE(status)) { 362 return; 363 } 364 if (table.getSize() != 1) { 365 status = U_INVALID_FORMAT_ERROR; 366 return; 367 } 368 const char *key; 369 table.getKeyAndValue(0, key, value); 370 int32_t uTmpLen; 371 outQuantitiesArray[outIndex] = value.getString(uTmpLen, status); 372 trieBuilder.add(key, outIndex, status); 373 outIndex++; 374 } 375 } 376 377 private: 378 const char16_t **outQuantitiesArray; 379 int32_t &outSize; 380 BytesTrieBuilder &trieBuilder; 381 382 int32_t outIndex; 383 }; 384 385 icu::UInitOnce gUnitExtrasInitOnce {}; 386 387 // Array of unit aliases. 388 const char** gUnitReplacements; 389 const char* gUnitReplacementStrings; 390 int32_t gNumUnitReplacements; 391 392 // Array of simple unit IDs. 393 // 394 // The array memory itself is owned by this pointer, but the individual char* in 395 // that array point at static memory. (Note that these char* are also returned 396 // by SingleUnitImpl::getSimpleUnitID().) 397 const char **gSimpleUnits = nullptr; 398 399 // Maps from the value associated with each simple unit ID to an index into the 400 // gCategories array. 401 int32_t *gSimpleUnitCategories = nullptr; 402 403 char *gSerializedUnitExtrasStemTrie = nullptr; 404 405 // Array of char16_t* pointing at the unit categories (aka "quantities", aka 406 // "types"), as found in the `unitQuantities` resource. The array memory itself 407 // is owned by this pointer, but the individual char16_t* in that array point at 408 // static memory. 409 const char16_t **gCategories = nullptr; 410 // Number of items in `gCategories`. 411 int32_t gCategoriesCount = 0; 412 // Serialized BytesTrie for mapping from base units to indices into gCategories. 413 char *gSerializedUnitCategoriesTrie = nullptr; 414 415 UBool U_CALLCONV cleanupUnitExtras() { 416 uprv_free(gSerializedUnitCategoriesTrie); 417 gSerializedUnitCategoriesTrie = nullptr; 418 uprv_free(gCategories); 419 gCategories = nullptr; 420 uprv_free(gSerializedUnitExtrasStemTrie); 421 gSerializedUnitExtrasStemTrie = nullptr; 422 uprv_free(gSimpleUnitCategories); 423 gSimpleUnitCategories = nullptr; 424 uprv_free(gSimpleUnits); 425 gSimpleUnits = nullptr; 426 uprv_free((void*)gUnitReplacementStrings); 427 gUnitReplacementStrings = nullptr; 428 uprv_free(gUnitReplacements); 429 gUnitReplacements = nullptr; 430 gNumUnitReplacements = 0; 431 gUnitExtrasInitOnce.reset(); 432 return true; 433 } 434 435 void U_CALLCONV initUnitExtras(UErrorCode& status) { 436 ucln_i18n_registerCleanup(UCLN_I18N_UNIT_EXTRAS, cleanupUnitExtras); 437 LocalUResourceBundlePointer unitsBundle(ures_openDirect(nullptr, "units", &status)); 438 439 // Collect unitQuantities information into gSerializedUnitCategoriesTrie and gCategories. 440 const char *CATEGORY_TABLE_NAME = "unitQuantities"; 441 LocalUResourceBundlePointer unitQuantities( 442 ures_getByKey(unitsBundle.getAlias(), CATEGORY_TABLE_NAME, nullptr, &status)); 443 if (U_FAILURE(status)) { return; } 444 gCategoriesCount = unitQuantities.getAlias()->fSize; 445 size_t quantitiesMallocSize = sizeof(char16_t *) * gCategoriesCount; 446 gCategories = static_cast<const char16_t **>(uprv_malloc(quantitiesMallocSize)); 447 if (gCategories == nullptr) { 448 status = U_MEMORY_ALLOCATION_ERROR; 449 return; 450 } 451 uprv_memset(gCategories, 0, quantitiesMallocSize); 452 BytesTrieBuilder quantitiesBuilder(status); 453 CategoriesSink categoriesSink(gCategories, gCategoriesCount, quantitiesBuilder); 454 ures_getAllItemsWithFallback(unitsBundle.getAlias(), CATEGORY_TABLE_NAME, categoriesSink, status); 455 StringPiece resultQuantities = quantitiesBuilder.buildStringPiece(USTRINGTRIE_BUILD_FAST, status); 456 if (U_FAILURE(status)) { return; } 457 // Copy the result into the global constant pointer 458 size_t numBytesQuantities = resultQuantities.length(); 459 gSerializedUnitCategoriesTrie = static_cast<char *>(uprv_malloc(numBytesQuantities)); 460 if (gSerializedUnitCategoriesTrie == nullptr) { 461 status = U_MEMORY_ALLOCATION_ERROR; 462 return; 463 } 464 uprv_memcpy(gSerializedUnitCategoriesTrie, resultQuantities.data(), numBytesQuantities); 465 466 // Build the BytesTrie that Parser needs for parsing unit identifiers. 467 468 BytesTrieBuilder b(status); 469 if (U_FAILURE(status)) { return; } 470 471 // Add SI and binary prefixes 472 for (const auto& unitPrefixInfo : gUnitPrefixStrings) { 473 b.add(unitPrefixInfo.string, unitPrefixInfo.value + kPrefixOffset, status); 474 } 475 if (U_FAILURE(status)) { return; } 476 477 // Add syntax parts (compound, power prefixes) 478 b.add("-per-", COMPOUND_PART_PER, status); 479 b.add("-", COMPOUND_PART_TIMES, status); 480 b.add("-and-", COMPOUND_PART_AND, status); 481 b.add("per-", INITIAL_COMPOUND_PART_PER, status); 482 b.add("square-", POWER_PART_P2, status); 483 b.add("cubic-", POWER_PART_P3, status); 484 b.add("pow2-", POWER_PART_P2, status); 485 b.add("pow3-", POWER_PART_P3, status); 486 b.add("pow4-", POWER_PART_P4, status); 487 b.add("pow5-", POWER_PART_P5, status); 488 b.add("pow6-", POWER_PART_P6, status); 489 b.add("pow7-", POWER_PART_P7, status); 490 b.add("pow8-", POWER_PART_P8, status); 491 b.add("pow9-", POWER_PART_P9, status); 492 b.add("pow10-", POWER_PART_P10, status); 493 b.add("pow11-", POWER_PART_P11, status); 494 b.add("pow12-", POWER_PART_P12, status); 495 b.add("pow13-", POWER_PART_P13, status); 496 b.add("pow14-", POWER_PART_P14, status); 497 b.add("pow15-", POWER_PART_P15, status); 498 if (U_FAILURE(status)) { return; } 499 500 // Add sanctioned simple units by offset: simple units all have entries in 501 // units/convertUnits resources. 502 LocalUResourceBundlePointer convertUnits( 503 ures_getByKey(unitsBundle.getAlias(), "convertUnits", nullptr, &status)); 504 if (U_FAILURE(status)) { return; } 505 506 // Allocate enough space: with identifierSink below skipping kilogram, we're 507 // probably allocating one more than needed. 508 int32_t simpleUnitsCount = convertUnits.getAlias()->fSize; 509 int32_t arrayMallocSize = sizeof(char *) * simpleUnitsCount; 510 gSimpleUnits = static_cast<const char **>(uprv_malloc(arrayMallocSize)); 511 if (gSimpleUnits == nullptr) { 512 status = U_MEMORY_ALLOCATION_ERROR; 513 return; 514 } 515 uprv_memset(gSimpleUnits, 0, arrayMallocSize); 516 arrayMallocSize = sizeof(int32_t) * simpleUnitsCount; 517 gSimpleUnitCategories = static_cast<int32_t *>(uprv_malloc(arrayMallocSize)); 518 if (gSimpleUnitCategories == nullptr) { 519 status = U_MEMORY_ALLOCATION_ERROR; 520 return; 521 } 522 uprv_memset(gSimpleUnitCategories, 0, arrayMallocSize); 523 524 // Populate gSimpleUnits and build the associated trie. 525 SimpleUnitIdentifiersSink identifierSink(resultQuantities, gSimpleUnits, gSimpleUnitCategories, 526 simpleUnitsCount, b, kSimpleUnitOffset); 527 ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status); 528 529 // Populate gUnitReplacements and its associated data structures. 530 LocalUResourceBundlePointer aliasBundle(ures_open(U_ICUDATA_ALIAS, "metadata", &status)); 531 if (U_FAILURE(status)) { 532 return; 533 } 534 MaybeStackVector<CharString> unitAliases; 535 MaybeStackVector<CharString> unitReplacements; 536 537 UnitAliasesSink aliasSink(unitAliases, unitReplacements); 538 ures_getAllChildrenWithFallback(aliasBundle.getAlias(), "alias/unit", aliasSink, status); 539 if (U_FAILURE(status)) { 540 return; 541 } 542 543 for (int32_t i = 0; i < unitAliases.length(); i++) { 544 b.add(unitAliases[i]->data(), i + kAliasOffset, status); 545 if (U_FAILURE(status)) { 546 return; 547 } 548 } 549 550 int32_t unitReplacementStringLength = 0; 551 for (int32_t i = 0; i < unitReplacements.length(); i++) { 552 unitReplacementStringLength += unitReplacements[i]->length() + 1; 553 } 554 gUnitReplacementStrings = (const char*)uprv_malloc(unitReplacementStringLength * sizeof(char)); 555 gUnitReplacements = (const char**)uprv_malloc(unitReplacements.length() * sizeof(const char**)); 556 if (gUnitReplacementStrings == nullptr || gUnitReplacements == nullptr) { 557 status = U_MEMORY_ALLOCATION_ERROR; 558 return; 559 } 560 gNumUnitReplacements = unitReplacements.length(); 561 char* p = const_cast<char*>(gUnitReplacementStrings); 562 for (int32_t i = 0; i < unitReplacements.length(); i++) { 563 gUnitReplacements[i] = p; 564 uprv_strcpy(p, unitReplacements[i]->data()); 565 p += unitReplacements[i]->length() + 1; 566 } 567 568 // Build the CharsTrie 569 // TODO: Use SLOW or FAST here? 570 StringPiece result = b.buildStringPiece(USTRINGTRIE_BUILD_FAST, status); 571 if (U_FAILURE(status)) { return; } 572 573 // Copy the result into the global constant pointer 574 size_t numBytes = result.length(); 575 gSerializedUnitExtrasStemTrie = static_cast<char *>(uprv_malloc(numBytes)); 576 if (gSerializedUnitExtrasStemTrie == nullptr) { 577 status = U_MEMORY_ALLOCATION_ERROR; 578 return; 579 } 580 uprv_memcpy(gSerializedUnitExtrasStemTrie, result.data(), numBytes); 581 } 582 583 class Token { 584 public: 585 Token(int64_t match) : fMatch(match) { 586 if (fMatch < kCompoundPartOffset) { 587 this->fType = TYPE_PREFIX; 588 } else if (fMatch < kInitialCompoundPartOffset) { 589 this->fType = TYPE_COMPOUND_PART; 590 } else if (fMatch < kPowerPartOffset) { 591 this->fType = TYPE_INITIAL_COMPOUND_PART; 592 } else if (fMatch < kSimpleUnitOffset) { 593 this->fType = TYPE_POWER_PART; 594 } else if (fMatch < kAliasOffset) { 595 this->fType = TYPE_SIMPLE_UNIT; 596 } else { 597 this->fType = TYPE_ALIAS; 598 } 599 } 600 601 static Token constantToken(StringPiece str, UErrorCode &status) { 602 Token result; 603 auto value = Token::parseStringToLong(str, status); 604 if (U_FAILURE(status)) { 605 return result; 606 } 607 result.fMatch = value; 608 result.fType = TYPE_CONSTANT_DENOMINATOR; 609 return result; 610 } 611 612 enum Type { 613 TYPE_UNDEFINED, 614 TYPE_PREFIX, 615 // Token type for "-per-", "-", and "-and-". 616 TYPE_COMPOUND_PART, 617 // Token type for "per-". 618 TYPE_INITIAL_COMPOUND_PART, 619 TYPE_POWER_PART, 620 TYPE_SIMPLE_UNIT, 621 TYPE_CONSTANT_DENOMINATOR, 622 TYPE_ALIAS, 623 }; 624 625 // Calling getType() is invalid, resulting in an assertion failure, if Token 626 // value isn't positive. 627 Type getType() const { 628 U_ASSERT(fMatch >= 0); 629 return this->fType; 630 } 631 632 // Retrieve the value of the constant denominator if the token is of type TYPE_CONSTANT_DENOMINATOR. 633 uint64_t getConstantDenominator() const { 634 U_ASSERT(getType() == TYPE_CONSTANT_DENOMINATOR); 635 return static_cast<uint64_t>(fMatch); 636 } 637 638 UMeasurePrefix getUnitPrefix() const { 639 U_ASSERT(getType() == TYPE_PREFIX); 640 return static_cast<UMeasurePrefix>(fMatch - kPrefixOffset); 641 } 642 643 // Valid only for tokens with type TYPE_COMPOUND_PART. 644 int32_t getMatch() const { 645 U_ASSERT(getType() == TYPE_COMPOUND_PART); 646 return fMatch; 647 } 648 649 int32_t getInitialCompoundPart() const { 650 // Even if there is only one InitialCompoundPart value, we have this 651 // function for the simplicity of code consistency. 652 U_ASSERT(getType() == TYPE_INITIAL_COMPOUND_PART); 653 // Defensive: if this assert fails, code using this function also needs 654 // to change. 655 U_ASSERT(fMatch == INITIAL_COMPOUND_PART_PER); 656 return fMatch; 657 } 658 659 int8_t getPower() const { 660 U_ASSERT(getType() == TYPE_POWER_PART); 661 return static_cast<int8_t>(fMatch - kPowerPartOffset); 662 } 663 664 int32_t getSimpleUnitIndex() const { 665 U_ASSERT(getType() == TYPE_SIMPLE_UNIT); 666 return fMatch - kSimpleUnitOffset; 667 } 668 669 int32_t getAliasIndex() const { 670 U_ASSERT(getType() == TYPE_ALIAS); 671 return static_cast<int32_t>(fMatch - kAliasOffset); 672 } 673 674 // TODO: Consider moving this to a separate utility class. 675 // Utility function to parse a string into an unsigned long value. 676 // The value must be a positive integer within the range [1, INT64_MAX]. 677 // The input can be in integer or scientific notation. 678 static uint64_t parseStringToLong(const StringPiece strNum, UErrorCode &status) { 679 // We are processing well-formed input, so we don't need any special options to 680 // StringToDoubleConverter. 681 StringToDoubleConverter converter(0, 0, 0, "", ""); 682 int32_t count; 683 double double_result = converter.StringToDouble(strNum.data(), strNum.length(), &count); 684 if (count != strNum.length()) { 685 status = kUnitIdentifierSyntaxError; 686 return 0; 687 } 688 689 if (U_FAILURE(status) || double_result < 1.0 || double_result > static_cast<double>(INT64_MAX)) { 690 status = kUnitIdentifierSyntaxError; 691 return 0; 692 } 693 694 // Check if the value is integer. 695 uint64_t int_result = static_cast<uint64_t>(double_result); 696 const double kTolerance = 1e-9; 697 if (abs(double_result - int_result) > kTolerance) { 698 status = kUnitIdentifierSyntaxError; 699 return 0; 700 } 701 702 return int_result; 703 } 704 705 private: 706 Token() = default; 707 int64_t fMatch; 708 Type fType = TYPE_UNDEFINED; 709 }; 710 711 class Parser { 712 public: 713 /** 714 * Factory function for parsing the given identifier. 715 * 716 * @param source The identifier to parse. This function does not make a copy 717 * of source: the underlying string that source points at, must outlive the 718 * parser. 719 * @param status ICU error code. 720 */ 721 static Parser from(StringPiece source, UErrorCode& status) { 722 if (U_FAILURE(status)) { 723 return {}; 724 } 725 umtx_initOnce(gUnitExtrasInitOnce, &initUnitExtras, status); 726 if (U_FAILURE(status)) { 727 return {}; 728 } 729 return {source}; 730 } 731 732 /** 733 * A single unit or a constant denominator. 734 */ 735 struct SingleUnitOrConstant { 736 enum ValueType { 737 kSingleUnit, 738 kConstantDenominator, 739 }; 740 741 ValueType type = kSingleUnit; 742 SingleUnitImpl singleUnit; 743 uint64_t constantDenominator; 744 745 static SingleUnitOrConstant singleUnitValue(SingleUnitImpl singleUnit) { 746 SingleUnitOrConstant result; 747 result.type = kSingleUnit; 748 result.singleUnit = singleUnit; 749 result.constantDenominator = 0; 750 return result; 751 } 752 753 static SingleUnitOrConstant constantDenominatorValue(uint64_t constant) { 754 SingleUnitOrConstant result; 755 result.type = kConstantDenominator; 756 result.singleUnit = {}; 757 result.constantDenominator = constant; 758 return result; 759 } 760 761 uint64_t getConstantDenominator() const { 762 U_ASSERT(type == kConstantDenominator); 763 return constantDenominator; 764 } 765 766 SingleUnitImpl getSingleUnit() const { 767 U_ASSERT(type == kSingleUnit); 768 return singleUnit; 769 } 770 771 bool isSingleUnit() const { return type == kSingleUnit; } 772 773 bool isConstantDenominator() const { return type == kConstantDenominator; } 774 }; 775 776 MeasureUnitImpl parse(UErrorCode& status) { 777 MeasureUnitImpl result; 778 779 if (U_FAILURE(status)) { 780 return result; 781 } 782 if (fSource.empty()) { 783 // The dimenionless unit: nothing to parse. leave result as is. 784 return result; 785 } 786 787 while (hasNext()) { 788 bool sawAnd = false; 789 790 auto singleUnitOrConstant = nextSingleUnitOrConstant(sawAnd, status); 791 if (U_FAILURE(status)) { 792 return result; 793 } 794 795 if (singleUnitOrConstant.isConstantDenominator()) { 796 if (result.constantDenominator > 0) { 797 status = kUnitIdentifierSyntaxError; 798 return result; 799 } 800 result.constantDenominator = singleUnitOrConstant.getConstantDenominator(); 801 result.complexity = UMEASURE_UNIT_COMPOUND; 802 continue; 803 } 804 805 U_ASSERT(singleUnitOrConstant.isSingleUnit()); 806 bool added = result.appendSingleUnit(singleUnitOrConstant.getSingleUnit(), status); 807 if (U_FAILURE(status)) { 808 return result; 809 } 810 811 if (sawAnd && !added) { 812 // Two similar units are not allowed in a mixed unit. 813 status = kUnitIdentifierSyntaxError; 814 return result; 815 } 816 817 if (result.singleUnits.length() >= 2) { 818 // nextSingleUnit fails appropriately for "per" and "and" in the 819 // same identifier. It doesn't fail for other compound units 820 // (COMPOUND_PART_TIMES). Consequently we take care of that 821 // here. 822 UMeasureUnitComplexity complexity = 823 sawAnd ? UMEASURE_UNIT_MIXED : UMEASURE_UNIT_COMPOUND; 824 if (result.singleUnits.length() == 2) { 825 // After appending two singleUnits, the complexity will be `UMEASURE_UNIT_COMPOUND` 826 U_ASSERT(result.complexity == UMEASURE_UNIT_COMPOUND); 827 result.complexity = complexity; 828 } else if (result.complexity != complexity) { 829 // Can't have mixed compound units 830 status = kUnitIdentifierSyntaxError; 831 return result; 832 } 833 } 834 } 835 836 if (result.singleUnits.length() == 0) { 837 // The identifier was empty or only had a constant denominator. 838 status = kUnitIdentifierSyntaxError; 839 return result; // add it for code consistency. 840 } 841 842 return result; 843 } 844 845 private: 846 // Tracks parser progress: the offset into fSource. 847 int32_t fIndex = 0; 848 849 // Since we're not owning this memory, whatever is passed to the constructor 850 // should live longer than this Parser - and the parser shouldn't return any 851 // references to that string. 852 StringPiece fSource; 853 BytesTrie fTrie; 854 855 // Storage for modified source string when aliases are expanded 856 CharString fModifiedSource; 857 858 // Set to true when we've seen a "-per-" or a "per-", after which all units 859 // are in the denominator. Until we find an "-and-", at which point the 860 // identifier is invalid pending TODO(CLDR-13701). 861 bool fAfterPer = false; 862 863 // Set to true when we've just seen a "per-". This is used to determine if 864 // the next token can be a constant denominator token. 865 bool fJustSawPer = false; 866 867 Parser() : fSource(""), fTrie(u"") {} 868 869 Parser(StringPiece source) 870 : fSource(source), fTrie(gSerializedUnitExtrasStemTrie) {} 871 872 inline bool hasNext() const { 873 return fIndex < fSource.length(); 874 } 875 876 // Returns the next Token parsed from fSource, advancing fIndex to the end 877 // of that token in fSource. In case of U_FAILURE(status), the token 878 // returned will cause an abort if getType() is called on it. 879 Token nextToken(UErrorCode& status) { 880 fTrie.reset(); 881 int32_t match = -1; 882 // Saves the position in the fSource string for the end of the most 883 // recent matching token. 884 int32_t previ = -1; 885 886 // Saves the position in the fSource string for later use in case of unit constant found. 887 int32_t currentFIndex = fIndex; 888 889 // Find the longest token that matches a value in the trie: 890 while (fIndex < fSource.length()) { 891 auto result = fTrie.next(fSource.data()[fIndex++]); 892 if (result == USTRINGTRIE_NO_MATCH) { 893 break; 894 } else if (result == USTRINGTRIE_NO_VALUE) { 895 continue; 896 } 897 U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); 898 match = fTrie.getValue(); 899 previ = fIndex; 900 if (result == USTRINGTRIE_FINAL_VALUE) { 901 break; 902 } 903 U_ASSERT(result == USTRINGTRIE_INTERMEDIATE_VALUE); 904 // continue; 905 } 906 907 if (match >= 0) { 908 fIndex = previ; 909 return {match}; 910 } 911 912 // If no match was found, we check if the token is a constant denominator. 913 // 1. We find the index of the start of the next token or the end of the string. 914 int32_t endOfConstantIndex = fSource.find("-", currentFIndex); 915 endOfConstantIndex = (endOfConstantIndex == -1) ? fSource.length() : endOfConstantIndex; 916 if (endOfConstantIndex <= currentFIndex) { 917 status = kUnitIdentifierSyntaxError; 918 return {match}; 919 } 920 921 // 2. We extract the substring from the start of the constant to the end of the constant. 922 StringPiece constantDenominatorStr = 923 fSource.substr(currentFIndex, endOfConstantIndex - currentFIndex); 924 fIndex = endOfConstantIndex; 925 return Token::constantToken(constantDenominatorStr, status); 926 } 927 928 /** 929 * Returns the next "single unit" via result. 930 * 931 * If a "-per-" was parsed, the result will have appropriate negative 932 * dimensionality. 933 * 934 * Returns an error if we parse both compound units and "-and-", since mixed 935 * compound units are not yet supported - TODO(CLDR-13701). 936 * 937 * @param result Will be overwritten by the result, if status shows success. 938 * @param sawAnd If an "-and-" was parsed prior to finding the "single 939 * unit", sawAnd is set to true. If not, it is left as is. 940 * @param status ICU error code. 941 */ 942 SingleUnitOrConstant nextSingleUnitOrConstant(bool &sawAnd, UErrorCode &status) { 943 SingleUnitImpl singleUnitResult; 944 if (U_FAILURE(status)) { 945 return {}; 946 } 947 948 // state: 949 // 0 = no tokens seen yet (will accept power, SI or binary prefix, or simple unit) 950 // 1 = power token seen (will not accept another power token) 951 // 2 = SI or binary prefix token seen (will not accept a power, or SI or binary prefix token) 952 int32_t state = 0; 953 954 bool atStart = fIndex == 0; 955 Token token = nextToken(status); 956 if (U_FAILURE(status)) { 957 return {}; 958 } 959 960 // Handles the case where the alias replacement begins with "per-". 961 // For example: 962 // if the alias is "permeter" and the replacement is "per-meter". 963 // NOTE: This case does not currently exist in CLDR, but this code anticipates possible future 964 // additions. 965 if (token.getType() == Token::TYPE_ALIAS) { 966 processAlias(token, status); 967 token = nextToken(status); 968 if (U_FAILURE(status)) { 969 return {}; 970 } 971 } 972 973 fJustSawPer = false; 974 975 if (atStart) { 976 // Identifiers optionally start with "per-". 977 if (token.getType() == Token::TYPE_INITIAL_COMPOUND_PART) { 978 U_ASSERT(token.getInitialCompoundPart() == INITIAL_COMPOUND_PART_PER); 979 fAfterPer = true; 980 fJustSawPer = true; 981 singleUnitResult.dimensionality = -1; 982 983 token = nextToken(status); 984 if (U_FAILURE(status)) { 985 return {}; 986 } 987 } 988 } else { 989 // All other SingleUnit's are separated from previous SingleUnit's 990 // via a compound part: 991 if (token.getType() != Token::TYPE_COMPOUND_PART) { 992 status = kUnitIdentifierSyntaxError; 993 return {}; 994 } 995 996 switch (token.getMatch()) { 997 case COMPOUND_PART_PER: 998 if (sawAnd) { 999 // Mixed compound units not yet supported, 1000 // TODO(CLDR-13701). 1001 status = kUnitIdentifierSyntaxError; 1002 return {}; 1003 } 1004 fAfterPer = true; 1005 fJustSawPer = true; 1006 singleUnitResult.dimensionality = -1; 1007 break; 1008 1009 case COMPOUND_PART_TIMES: 1010 if (fAfterPer) { 1011 singleUnitResult.dimensionality = -1; 1012 } 1013 break; 1014 1015 case COMPOUND_PART_AND: 1016 if (fAfterPer) { 1017 // Can't start with "-and-", and mixed compound units 1018 // not yet supported, TODO(CLDR-13701). 1019 status = kUnitIdentifierSyntaxError; 1020 return {}; 1021 } 1022 sawAnd = true; 1023 break; 1024 } 1025 1026 token = nextToken(status); 1027 if (U_FAILURE(status)) { 1028 return {}; 1029 } 1030 } 1031 1032 if (token.getType() == Token::TYPE_CONSTANT_DENOMINATOR) { 1033 if (!fJustSawPer) { 1034 status = kUnitIdentifierSyntaxError; 1035 return {}; 1036 } 1037 1038 return SingleUnitOrConstant::constantDenominatorValue(token.getConstantDenominator()); 1039 } 1040 1041 // Read tokens until we have a complete SingleUnit or we reach the end. 1042 while (true) { 1043 switch (token.getType()) { 1044 case Token::TYPE_POWER_PART: 1045 if (state > 0) { 1046 status = kUnitIdentifierSyntaxError; 1047 return {}; 1048 } 1049 singleUnitResult.dimensionality *= token.getPower(); 1050 state = 1; 1051 break; 1052 1053 case Token::TYPE_PREFIX: 1054 if (state > 1) { 1055 status = kUnitIdentifierSyntaxError; 1056 return {}; 1057 } 1058 singleUnitResult.unitPrefix = token.getUnitPrefix(); 1059 state = 2; 1060 break; 1061 1062 case Token::TYPE_SIMPLE_UNIT: 1063 singleUnitResult.index = token.getSimpleUnitIndex(); 1064 break; 1065 1066 case Token::TYPE_ALIAS: 1067 processAlias(token, status); 1068 break; 1069 1070 default: 1071 status = kUnitIdentifierSyntaxError; 1072 return {}; 1073 } 1074 1075 if (token.getType() == Token::TYPE_SIMPLE_UNIT) { 1076 break; 1077 } 1078 1079 if (!hasNext()) { 1080 // We ran out of tokens before finding a complete single unit. 1081 status = kUnitIdentifierSyntaxError; 1082 return {}; 1083 } 1084 token = nextToken(status); 1085 if (U_FAILURE(status)) { 1086 return {}; 1087 } 1088 } 1089 1090 return SingleUnitOrConstant::singleUnitValue(singleUnitResult); 1091 } 1092 1093 private: 1094 /** 1095 * Helper function to process alias replacement. 1096 * 1097 * @param token The token of TYPE_ALIAS to process 1098 * @param status ICU error code 1099 */ 1100 void processAlias(const Token &token, UErrorCode &status) { 1101 if (U_FAILURE(status)) { 1102 return; 1103 } 1104 1105 auto aliasIndex = token.getAliasIndex(); 1106 if (aliasIndex < 0 || aliasIndex >= gNumUnitReplacements) { 1107 status = kUnitIdentifierSyntaxError; 1108 return; 1109 } 1110 const char* replacement = gUnitReplacements[aliasIndex]; 1111 1112 // Create new source string: replacement + remaining unparsed portion 1113 fModifiedSource.clear(); 1114 fModifiedSource.append(StringPiece(replacement), status); 1115 1116 // Add the remaining unparsed portion of fSource which starts from fIndex 1117 if (fIndex < fSource.length()) { 1118 StringPiece remaining = fSource.substr(fIndex); 1119 fModifiedSource.append(remaining.data(), remaining.length(), status); 1120 } 1121 1122 if (U_FAILURE(status)) { 1123 return; 1124 } 1125 1126 // Update parser state with new source and reset index 1127 fSource = StringPiece(fModifiedSource.data(), fModifiedSource.length()); 1128 fIndex = 0; 1129 1130 return; 1131 } 1132 }; 1133 1134 // Sorting function wrapping SingleUnitImpl::compareTo for use with uprv_sortArray. 1135 int32_t U_CALLCONV 1136 compareSingleUnits(const void* /*context*/, const void* left, const void* right) { 1137 const auto* realLeft = static_cast<const SingleUnitImpl* const*>(left); 1138 const auto* realRight = static_cast<const SingleUnitImpl* const*>(right); 1139 return (*realLeft)->compareTo(**realRight); 1140 } 1141 1142 // Returns an index into the gCategories array, for the "unitQuantity" (aka 1143 // "type" or "category") associated with the given base unit identifier. Returns 1144 // -1 on failure, together with U_UNSUPPORTED_ERROR. 1145 int32_t getUnitCategoryIndex(BytesTrie &trie, StringPiece baseUnitIdentifier, UErrorCode &status) { 1146 UStringTrieResult result = trie.reset().next(baseUnitIdentifier.data(), baseUnitIdentifier.length()); 1147 if (!USTRINGTRIE_HAS_VALUE(result)) { 1148 status = U_UNSUPPORTED_ERROR; 1149 return -1; 1150 } 1151 1152 return trie.getValue(); 1153 } 1154 1155 } // namespace 1156 1157 U_CAPI int32_t U_EXPORT2 1158 umeas_getPrefixPower(UMeasurePrefix unitPrefix) { 1159 if (unitPrefix >= UMEASURE_PREFIX_INTERNAL_MIN_BIN && 1160 unitPrefix <= UMEASURE_PREFIX_INTERNAL_MAX_BIN) { 1161 return unitPrefix - UMEASURE_PREFIX_INTERNAL_ONE_BIN; 1162 } 1163 U_ASSERT(unitPrefix >= UMEASURE_PREFIX_INTERNAL_MIN_SI && 1164 unitPrefix <= UMEASURE_PREFIX_INTERNAL_MAX_SI); 1165 return unitPrefix - UMEASURE_PREFIX_ONE; 1166 } 1167 1168 U_CAPI int32_t U_EXPORT2 1169 umeas_getPrefixBase(UMeasurePrefix unitPrefix) { 1170 if (unitPrefix >= UMEASURE_PREFIX_INTERNAL_MIN_BIN && 1171 unitPrefix <= UMEASURE_PREFIX_INTERNAL_MAX_BIN) { 1172 return 1024; 1173 } 1174 U_ASSERT(unitPrefix >= UMEASURE_PREFIX_INTERNAL_MIN_SI && 1175 unitPrefix <= UMEASURE_PREFIX_INTERNAL_MAX_SI); 1176 return 10; 1177 } 1178 1179 CharString U_I18N_API getUnitQuantity(const MeasureUnitImpl &baseMeasureUnitImpl, UErrorCode &status) { 1180 CharString result; 1181 MeasureUnitImpl baseUnitImpl = baseMeasureUnitImpl.copy(status); 1182 UErrorCode localStatus = U_ZERO_ERROR; 1183 umtx_initOnce(gUnitExtrasInitOnce, &initUnitExtras, status); 1184 if (U_FAILURE(status)) { 1185 return result; 1186 } 1187 BytesTrie trie(gSerializedUnitCategoriesTrie); 1188 1189 baseUnitImpl.serialize(status); 1190 StringPiece identifier = baseUnitImpl.identifier.data(); 1191 int32_t idx = getUnitCategoryIndex(trie, identifier, localStatus); 1192 if (U_FAILURE(status)) { 1193 return result; 1194 } 1195 1196 // In case the base unit identifier did not match any entry. 1197 if (U_FAILURE(localStatus)) { 1198 localStatus = U_ZERO_ERROR; 1199 baseUnitImpl.takeReciprocal(status); 1200 baseUnitImpl.serialize(status); 1201 identifier.set(baseUnitImpl.identifier.data()); 1202 idx = getUnitCategoryIndex(trie, identifier, localStatus); 1203 1204 if (U_FAILURE(status)) { 1205 return result; 1206 } 1207 } 1208 1209 // In case the reciprocal of the base unit identifier did not match any entry. 1210 MeasureUnitImpl simplifiedUnit = baseMeasureUnitImpl.copyAndSimplify(status); 1211 if (U_FAILURE(status)) { 1212 return result; 1213 } 1214 if (U_FAILURE(localStatus)) { 1215 localStatus = U_ZERO_ERROR; 1216 simplifiedUnit.serialize(status); 1217 identifier.set(simplifiedUnit.identifier.data()); 1218 idx = getUnitCategoryIndex(trie, identifier, localStatus); 1219 1220 if (U_FAILURE(status)) { 1221 return result; 1222 } 1223 } 1224 1225 // In case the simplified base unit identifier did not match any entry. 1226 if (U_FAILURE(localStatus)) { 1227 localStatus = U_ZERO_ERROR; 1228 simplifiedUnit.takeReciprocal(status); 1229 simplifiedUnit.serialize(status); 1230 identifier.set(simplifiedUnit.identifier.data()); 1231 idx = getUnitCategoryIndex(trie, identifier, localStatus); 1232 1233 if (U_FAILURE(status)) { 1234 return result; 1235 } 1236 } 1237 1238 // If there is no match at all, throw an exception. 1239 if (U_FAILURE(localStatus)) { 1240 status = U_INVALID_FORMAT_ERROR; 1241 return result; 1242 } 1243 1244 if (idx < 0 || idx >= gCategoriesCount) { 1245 status = U_INVALID_FORMAT_ERROR; 1246 return result; 1247 } 1248 1249 result.appendInvariantChars(gCategories[idx], u_strlen(gCategories[idx]), status); 1250 return result; 1251 } 1252 1253 // In ICU4J, this is MeasureUnit.getSingleUnitImpl(). 1254 SingleUnitImpl SingleUnitImpl::forMeasureUnit(const MeasureUnit& measureUnit, UErrorCode& status) { 1255 MeasureUnitImpl temp; 1256 const MeasureUnitImpl& impl = MeasureUnitImpl::forMeasureUnit(measureUnit, temp, status); 1257 if (U_FAILURE(status)) { 1258 return {}; 1259 } 1260 if (impl.singleUnits.length() == 0) { 1261 return {}; 1262 } 1263 if (impl.singleUnits.length() == 1) { 1264 return *impl.singleUnits[0]; 1265 } 1266 status = U_ILLEGAL_ARGUMENT_ERROR; 1267 return {}; 1268 } 1269 1270 MeasureUnit SingleUnitImpl::build(UErrorCode& status) const { 1271 MeasureUnitImpl temp; 1272 temp.appendSingleUnit(*this, status); 1273 // TODO(icu-units#28): the MeasureUnitImpl::build() method uses 1274 // findBySubtype, which is relatively slow. 1275 // - At the time of loading the simple unit IDs, we could also save a 1276 // mapping to the builtin MeasureUnit type and subtype they correspond to. 1277 // - This method could then check dimensionality and index, and if both are 1278 // 1, directly return MeasureUnit instances very quickly. 1279 return std::move(temp).build(status); 1280 } 1281 1282 const char *SingleUnitImpl::getSimpleUnitID() const { 1283 return gSimpleUnits[index]; 1284 } 1285 1286 void SingleUnitImpl::appendNeutralIdentifier(CharString &result, UErrorCode &status) const UPRV_NO_SANITIZE_UNDEFINED { 1287 int32_t absPower = std::abs(this->dimensionality); 1288 1289 U_ASSERT(absPower > 0); // "this function does not support the dimensionless single units"; 1290 1291 if (absPower == 1) { 1292 // no-op 1293 } else if (absPower == 2) { 1294 result.append(StringPiece("square-"), status); 1295 } else if (absPower == 3) { 1296 result.append(StringPiece("cubic-"), status); 1297 } else if (absPower <= 15) { 1298 result.append(StringPiece("pow"), status); 1299 result.appendNumber(absPower, status); 1300 result.append(StringPiece("-"), status); 1301 } else { 1302 status = U_ILLEGAL_ARGUMENT_ERROR; // Unit Identifier Syntax Error 1303 return; 1304 } 1305 1306 if (U_FAILURE(status)) { 1307 return; 1308 } 1309 1310 if (this->unitPrefix != UMEASURE_PREFIX_ONE) { 1311 bool found = false; 1312 for (const auto &unitPrefixInfo : gUnitPrefixStrings) { 1313 // TODO: consider using binary search? If we do this, add a unit 1314 // test to ensure gUnitPrefixStrings is sorted? 1315 if (unitPrefixInfo.value == this->unitPrefix) { 1316 result.append(unitPrefixInfo.string, status); 1317 found = true; 1318 break; 1319 } 1320 } 1321 if (!found) { 1322 status = U_UNSUPPORTED_ERROR; 1323 return; 1324 } 1325 } 1326 1327 result.append(StringPiece(this->getSimpleUnitID()), status); 1328 } 1329 1330 int32_t SingleUnitImpl::getUnitCategoryIndex() const { 1331 return gSimpleUnitCategories[index]; 1332 } 1333 1334 MeasureUnitImpl::MeasureUnitImpl(const SingleUnitImpl &singleUnit, UErrorCode &status) { 1335 this->appendSingleUnit(singleUnit, status); 1336 } 1337 1338 MeasureUnitImpl MeasureUnitImpl::forIdentifier(StringPiece identifier, UErrorCode& status) { 1339 return Parser::from(identifier, status).parse(status); 1340 } 1341 1342 const MeasureUnitImpl& MeasureUnitImpl::forMeasureUnit( 1343 const MeasureUnit& measureUnit, MeasureUnitImpl& memory, UErrorCode& status) { 1344 if (measureUnit.fImpl) { 1345 return *measureUnit.fImpl; 1346 } else { 1347 memory = Parser::from(measureUnit.getIdentifier(), status).parse(status); 1348 return memory; 1349 } 1350 } 1351 1352 MeasureUnitImpl MeasureUnitImpl::forMeasureUnitMaybeCopy( 1353 const MeasureUnit& measureUnit, UErrorCode& status) { 1354 if (measureUnit.fImpl) { 1355 return measureUnit.fImpl->copy(status); 1356 } else { 1357 return Parser::from(measureUnit.getIdentifier(), status).parse(status); 1358 } 1359 } 1360 1361 void MeasureUnitImpl::takeReciprocal(UErrorCode& /*status*/) { 1362 identifier.clear(); 1363 for (int32_t i = 0; i < singleUnits.length(); i++) { 1364 singleUnits[i]->dimensionality *= -1; 1365 } 1366 } 1367 1368 MeasureUnitImpl MeasureUnitImpl::copyAndSimplify(UErrorCode &status) const { 1369 MeasureUnitImpl result; 1370 for (int32_t i = 0; i < singleUnits.length(); i++) { 1371 const SingleUnitImpl &singleUnit = *this->singleUnits[i]; 1372 1373 // The following `for` loop will cause time complexity to be O(n^2). 1374 // However, n is very small (number of units, generally, at maximum equal to 10) 1375 bool unitExist = false; 1376 for (int32_t j = 0; j < result.singleUnits.length(); j++) { 1377 if (uprv_strcmp(result.singleUnits[j]->getSimpleUnitID(), singleUnit.getSimpleUnitID()) == 1378 0 && 1379 result.singleUnits[j]->unitPrefix == singleUnit.unitPrefix) { 1380 unitExist = true; 1381 result.singleUnits[j]->dimensionality = 1382 result.singleUnits[j]->dimensionality + singleUnit.dimensionality; 1383 break; 1384 } 1385 } 1386 1387 if (!unitExist) { 1388 result.appendSingleUnit(singleUnit, status); 1389 } 1390 } 1391 1392 return result; 1393 } 1394 1395 bool MeasureUnitImpl::appendSingleUnit(const SingleUnitImpl &singleUnit, UErrorCode &status) { 1396 identifier.clear(); 1397 1398 if (singleUnit.isDimensionless()) { 1399 // Do not append dimensionless units. 1400 return false; 1401 } 1402 1403 // Find a similar unit that already exists, to attempt to coalesce 1404 SingleUnitImpl *oldUnit = nullptr; 1405 for (int32_t i = 0; i < this->singleUnits.length(); i++) { 1406 auto *candidate = this->singleUnits[i]; 1407 if (candidate->isCompatibleWith(singleUnit)) { 1408 oldUnit = candidate; 1409 } 1410 } 1411 1412 if (oldUnit) { 1413 // Both dimensionalities will be positive, or both will be negative, by 1414 // virtue of isCompatibleWith(). 1415 oldUnit->dimensionality += singleUnit.dimensionality; 1416 1417 return false; 1418 } 1419 1420 // Add a copy of singleUnit 1421 // NOTE: MaybeStackVector::emplaceBackAndCheckErrorCode creates new copy of singleUnit. 1422 this->singleUnits.emplaceBackAndCheckErrorCode(status, singleUnit); 1423 if (U_FAILURE(status)) { 1424 return false; 1425 } 1426 1427 // If the MeasureUnitImpl is `UMEASURE_UNIT_SINGLE` and after the appending a unit, the `singleUnits` 1428 // contains more than one. thus means the complexity should be `UMEASURE_UNIT_COMPOUND` 1429 if (this->singleUnits.length() > 1 && 1430 this->complexity == UMeasureUnitComplexity::UMEASURE_UNIT_SINGLE) { 1431 this->complexity = UMeasureUnitComplexity::UMEASURE_UNIT_COMPOUND; 1432 } 1433 1434 return true; 1435 } 1436 1437 MaybeStackVector<MeasureUnitImplWithIndex> 1438 MeasureUnitImpl::extractIndividualUnitsWithIndices(UErrorCode &status) const { 1439 MaybeStackVector<MeasureUnitImplWithIndex> result; 1440 1441 if (this->complexity != UMeasureUnitComplexity::UMEASURE_UNIT_MIXED) { 1442 result.emplaceBackAndCheckErrorCode(status, 0, *this, status); 1443 return result; 1444 } 1445 1446 for (int32_t i = 0; i < singleUnits.length(); ++i) { 1447 result.emplaceBackAndCheckErrorCode(status, i, *singleUnits[i], status); 1448 if (U_FAILURE(status)) { 1449 return result; 1450 } 1451 } 1452 1453 return result; 1454 } 1455 1456 int32_t countCharacter(const CharString &str, char c) { 1457 int32_t count = 0; 1458 for (int32_t i = 0, n = str.length(); i < n; i++) { 1459 if (str[i] == c) { 1460 count++; 1461 } 1462 } 1463 return count; 1464 } 1465 1466 /** 1467 * Internal function that returns a string of the constants in the correct 1468 * format. 1469 * 1470 * Example: 1471 * 1000 --> "-per-1000" 1472 * 1000000 --> "-per-1e6" 1473 * 1474 * NOTE: this function is only used when the constant denominator is greater 1475 * than 0. 1476 */ 1477 CharString getConstantsString(uint64_t constantDenominator, UErrorCode &status) { 1478 U_ASSERT(constantDenominator > 0 && constantDenominator <= LLONG_MAX); 1479 1480 CharString result; 1481 result.appendNumber(constantDenominator, status); 1482 if (U_FAILURE(status)) { 1483 return result; 1484 } 1485 1486 if (constantDenominator <= 1000) { 1487 return result; 1488 } 1489 1490 // Check if the constant is a power of 10. 1491 int32_t zeros = countCharacter(result, '0'); 1492 if (zeros == result.length() - 1 && result[0] == '1') { 1493 result.clear(); 1494 result.append(StringPiece("1e"), status); 1495 result.appendNumber(zeros, status); 1496 } 1497 1498 return result; 1499 } 1500 1501 /** 1502 * Normalize a MeasureUnitImpl and generate the identifier string in place. 1503 */ 1504 void MeasureUnitImpl::serialize(UErrorCode &status) { 1505 if (U_FAILURE(status)) { 1506 return; 1507 } 1508 1509 if (this->singleUnits.length() == 0 && this->constantDenominator == 0) { 1510 // Dimensionless, constructed by the default constructor. 1511 return; 1512 } 1513 1514 if (this->complexity == UMEASURE_UNIT_COMPOUND) { 1515 // Note: don't sort a MIXED unit 1516 uprv_sortArray(this->singleUnits.getAlias(), this->singleUnits.length(), 1517 sizeof(this->singleUnits[0]), compareSingleUnits, nullptr, false, &status); 1518 if (U_FAILURE(status)) { 1519 return; 1520 } 1521 } 1522 1523 CharString result; 1524 bool beforePer = true; 1525 bool firstTimeNegativeDimension = false; 1526 bool constantDenominatorAppended = false; 1527 for (int32_t i = 0; i < this->singleUnits.length(); i++) { 1528 if (beforePer && (*this->singleUnits[i]).dimensionality < 0) { 1529 beforePer = false; 1530 firstTimeNegativeDimension = true; 1531 } else if ((*this->singleUnits[i]).dimensionality < 0) { 1532 firstTimeNegativeDimension = false; 1533 } 1534 1535 if (U_FAILURE(status)) { 1536 return; 1537 } 1538 1539 if (this->complexity == UMeasureUnitComplexity::UMEASURE_UNIT_MIXED) { 1540 if (result.length() != 0) { 1541 result.append(StringPiece("-and-"), status); 1542 } 1543 } else { 1544 if (firstTimeNegativeDimension) { 1545 if (result.length() == 0) { 1546 result.append(StringPiece("per-"), status); 1547 } else { 1548 result.append(StringPiece("-per-"), status); 1549 } 1550 1551 if (this->constantDenominator > 0) { 1552 result.append(getConstantsString(this->constantDenominator, status), status); 1553 result.append(StringPiece("-"), status); 1554 constantDenominatorAppended = true; 1555 } 1556 1557 } else if (result.length() != 0) { 1558 result.append(StringPiece("-"), status); 1559 } 1560 } 1561 1562 this->singleUnits[i]->appendNeutralIdentifier(result, status); 1563 } 1564 1565 if (!constantDenominatorAppended && this->constantDenominator > 0) { 1566 result.append(StringPiece("-per-"), status); 1567 result.append(getConstantsString(this->constantDenominator, status), status); 1568 } 1569 1570 if (U_FAILURE(status)) { 1571 return; 1572 } 1573 this->identifier = result.toStringPiece(); 1574 if (this->identifier.isEmpty() != result.isEmpty()) { 1575 status = U_MEMORY_ALLOCATION_ERROR; 1576 } 1577 } 1578 1579 MeasureUnit MeasureUnitImpl::build(UErrorCode &status) && { 1580 this->serialize(status); 1581 return MeasureUnit(std::move(*this)); 1582 } 1583 1584 MeasureUnit MeasureUnit::forIdentifier(StringPiece identifier, UErrorCode &status) { 1585 return Parser::from(identifier, status).parse(status).build(status); 1586 } 1587 1588 UMeasureUnitComplexity MeasureUnit::getComplexity(UErrorCode &status) const { 1589 MeasureUnitImpl temp; 1590 return MeasureUnitImpl::forMeasureUnit(*this, temp, status).complexity; 1591 } 1592 1593 UMeasurePrefix MeasureUnit::getPrefix(UErrorCode &status) const { 1594 return SingleUnitImpl::forMeasureUnit(*this, status).unitPrefix; 1595 } 1596 1597 MeasureUnit MeasureUnit::withPrefix(UMeasurePrefix prefix, 1598 UErrorCode &status) const UPRV_NO_SANITIZE_UNDEFINED { 1599 SingleUnitImpl singleUnit = SingleUnitImpl::forMeasureUnit(*this, status); 1600 singleUnit.unitPrefix = prefix; 1601 return singleUnit.build(status); 1602 } 1603 1604 uint64_t MeasureUnit::getConstantDenominator(UErrorCode &status) const { 1605 // TODO(ICU-23219) 1606 auto measureUnitImpl = MeasureUnitImpl::forMeasureUnitMaybeCopy(*this, status); 1607 if (U_FAILURE(status)) { 1608 return 0; 1609 } 1610 1611 auto complexity = measureUnitImpl.complexity; 1612 1613 if (complexity != UMEASURE_UNIT_SINGLE && complexity != UMEASURE_UNIT_COMPOUND) { 1614 status = U_ILLEGAL_ARGUMENT_ERROR; 1615 return 0; 1616 } 1617 1618 1619 return measureUnitImpl.constantDenominator; 1620 } 1621 1622 MeasureUnit MeasureUnit::withConstantDenominator(uint64_t denominator, UErrorCode &status) const { 1623 // To match the behavior of the Java API, we do not allow a constant denominator 1624 // bigger than LONG_MAX. 1625 if (denominator > LONG_MAX) { 1626 status = U_ILLEGAL_ARGUMENT_ERROR; 1627 return {}; 1628 } 1629 1630 auto complexity = this->getComplexity(status); 1631 if (U_FAILURE(status)) { 1632 return {}; 1633 } 1634 if (complexity != UMEASURE_UNIT_SINGLE && complexity != UMEASURE_UNIT_COMPOUND) { 1635 status = U_ILLEGAL_ARGUMENT_ERROR; 1636 return {}; 1637 } 1638 1639 MeasureUnitImpl impl = MeasureUnitImpl::forMeasureUnitMaybeCopy(*this, status); 1640 if (U_FAILURE(status)) { 1641 return {}; 1642 } 1643 1644 impl.constantDenominator = denominator; 1645 impl.complexity = (impl.singleUnits.length() < 2 && denominator == 0) ? UMEASURE_UNIT_SINGLE 1646 : UMEASURE_UNIT_COMPOUND; 1647 return std::move(impl).build(status); 1648 } 1649 1650 int32_t MeasureUnit::getDimensionality(UErrorCode& status) const { 1651 SingleUnitImpl singleUnit = SingleUnitImpl::forMeasureUnit(*this, status); 1652 if (U_FAILURE(status)) { return 0; } 1653 if (singleUnit.isDimensionless()) { 1654 return 0; 1655 } 1656 return singleUnit.dimensionality; 1657 } 1658 1659 MeasureUnit MeasureUnit::withDimensionality(int32_t dimensionality, UErrorCode& status) const { 1660 SingleUnitImpl singleUnit = SingleUnitImpl::forMeasureUnit(*this, status); 1661 singleUnit.dimensionality = dimensionality; 1662 return singleUnit.build(status); 1663 } 1664 1665 MeasureUnit MeasureUnit::reciprocal(UErrorCode& status) const { 1666 MeasureUnitImpl impl = MeasureUnitImpl::forMeasureUnitMaybeCopy(*this, status); 1667 // The reciprocal of a unit that has a constant denominator is not allowed. 1668 if (impl.constantDenominator != 0) { 1669 status = U_ILLEGAL_ARGUMENT_ERROR; 1670 return {}; 1671 } 1672 impl.takeReciprocal(status); 1673 return std::move(impl).build(status); 1674 } 1675 1676 MeasureUnit MeasureUnit::product(const MeasureUnit& other, UErrorCode& status) const { 1677 MeasureUnitImpl impl = MeasureUnitImpl::forMeasureUnitMaybeCopy(*this, status); 1678 MeasureUnitImpl temp; 1679 const MeasureUnitImpl& otherImpl = MeasureUnitImpl::forMeasureUnit(other, temp, status); 1680 if (impl.complexity == UMEASURE_UNIT_MIXED || otherImpl.complexity == UMEASURE_UNIT_MIXED) { 1681 status = U_ILLEGAL_ARGUMENT_ERROR; 1682 return {}; 1683 } 1684 for (int32_t i = 0; i < otherImpl.singleUnits.length(); i++) { 1685 impl.appendSingleUnit(*otherImpl.singleUnits[i], status); 1686 } 1687 1688 uint64_t currentConstatDenominator = impl.constantDenominator; 1689 uint64_t otherConstantDenominator = otherImpl.constantDenominator; 1690 1691 // TODO: we can also multiply the constant denominators instead of returning an error. 1692 if (currentConstatDenominator != 0 && otherConstantDenominator != 0) { 1693 // There is only `one` constant denominator in a compound unit. 1694 // Therefore, we Cannot multiply units that both of them have a constant denominator 1695 status = U_ILLEGAL_ARGUMENT_ERROR; 1696 return {}; 1697 } 1698 1699 // Because either one of the constant denominators is zero, we can use the maximum of them. 1700 impl.constantDenominator = uprv_max(currentConstatDenominator, otherConstantDenominator); 1701 1702 if (impl.singleUnits.length() > 1 || impl.constantDenominator > 0) { 1703 impl.complexity = UMEASURE_UNIT_COMPOUND; 1704 } 1705 1706 return std::move(impl).build(status); 1707 } 1708 1709 LocalArray<MeasureUnit> MeasureUnit::splitToSingleUnitsImpl(int32_t& outCount, UErrorCode& status) const { 1710 MeasureUnitImpl temp; 1711 const MeasureUnitImpl& impl = MeasureUnitImpl::forMeasureUnit(*this, temp, status); 1712 outCount = impl.singleUnits.length(); 1713 MeasureUnit* arr = new MeasureUnit[outCount]; 1714 if (arr == nullptr) { 1715 status = U_MEMORY_ALLOCATION_ERROR; 1716 return LocalArray<MeasureUnit>(); 1717 } 1718 for (int32_t i = 0; i < outCount; i++) { 1719 arr[i] = impl.singleUnits[i]->build(status); 1720 } 1721 return LocalArray<MeasureUnit>(arr, status); 1722 } 1723 1724 1725 U_NAMESPACE_END 1726 1727 #endif /* !UNCONFIG_NO_FORMATTING */