NumberFormatFields.cpp (13672B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 #include "ICU4CGlue.h" 5 #include "NumberFormatFields.h" 6 #include "ScopedICUObject.h" 7 8 #include "unicode/uformattedvalue.h" 9 #include "unicode/unum.h" 10 #include "unicode/unumberformatter.h" 11 12 namespace mozilla::intl { 13 14 bool NumberFormatFields::append(NumberPartType type, int32_t begin, 15 int32_t end) { 16 MOZ_ASSERT(begin >= 0); 17 MOZ_ASSERT(end >= 0); 18 MOZ_ASSERT(begin < end, "erm, aren't fields always non-empty?"); 19 20 return fields_.emplaceBack(uint32_t(begin), uint32_t(end), type); 21 } 22 23 bool NumberFormatFields::toPartsVector(size_t overallLength, 24 const NumberPartSourceMap& sourceMap, 25 NumberPartVector& parts) { 26 std::sort(fields_.begin(), fields_.end(), 27 [](const NumberFormatField& left, const NumberFormatField& right) { 28 // Sort first by begin index, then to place 29 // enclosing fields before nested fields. 30 return left.begin < right.begin || 31 (left.begin == right.begin && left.end > right.end); 32 }); 33 34 // Then iterate over the sorted field list to generate a sequence of parts 35 // (what ECMA-402 actually exposes). A part is a maximal character sequence 36 // entirely within no field or a single most-nested field. 37 // 38 // Diagrams may be helpful to illustrate how fields map to parts. Consider 39 // formatting -19,766,580,028,249.41, the US national surplus (negative 40 // because it's actually a debt) on October 18, 2016. 41 // 42 // var options = 43 // { style: "currency", currency: "USD", currencyDisplay: "name" }; 44 // var usdFormatter = new Intl.NumberFormat("en-US", options); 45 // usdFormatter.format(-19766580028249.41); 46 // 47 // The formatted result is "-19,766,580,028,249.41 US dollars". ICU 48 // identifies these fields in the string: 49 // 50 // UNUM_GROUPING_SEPARATOR_FIELD 51 // | 52 // UNUM_SIGN_FIELD | UNUM_DECIMAL_SEPARATOR_FIELD 53 // | __________/| | 54 // | / | | | | 55 // "-19,766,580,028,249.41 US dollars" 56 // \________________/ |/ \_______/ 57 // | | | 58 // UNUM_INTEGER_FIELD | UNUM_CURRENCY_FIELD 59 // | 60 // UNUM_FRACTION_FIELD 61 // 62 // These fields map to parts as follows: 63 // 64 // integer decimal 65 // _____|________ | 66 // / /| |\ |\ |\ | literal 67 // /| / | | \ | \ | \| | 68 // "-19,766,580,028,249.41 US dollars" 69 // | \___|___|___/ |/ \________/ 70 // | | | | 71 // | group | currency 72 // | | 73 // minusSign fraction 74 // 75 // The sign is a part. Each comma is a part, splitting the integer field 76 // into parts for trillions/billions/&c. digits. The decimal point is a 77 // part. Cents are a part. The space between cents and currency is a part 78 // (outside any field). Last, the currency field is a part. 79 80 class PartGenerator { 81 // The fields in order from start to end, then least to most nested. 82 const FieldsVector& fields; 83 84 // Index of the current field, in |fields|, being considered to 85 // determine part boundaries. |lastEnd <= fields[index].begin| is an 86 // invariant. 87 size_t index = 0; 88 89 // The end index of the last part produced, always less than or equal 90 // to |limit|, strictly increasing. 91 uint32_t lastEnd = 0; 92 93 // The length of the overall formatted string. 94 const uint32_t limit = 0; 95 96 NumberPartSourceMap sourceMap; 97 98 Vector<size_t, 4> enclosingFields; 99 100 void popEnclosingFieldsEndingAt(uint32_t end) { 101 MOZ_ASSERT_IF(enclosingFields.length() > 0, 102 fields[enclosingFields.back()].end >= end); 103 104 while (enclosingFields.length() > 0 && 105 fields[enclosingFields.back()].end == end) { 106 enclosingFields.popBack(); 107 } 108 } 109 110 bool nextPartInternal(NumberPart* part) { 111 size_t len = fields.length(); 112 MOZ_ASSERT(index <= len); 113 114 // If we're out of fields, all that remains are part(s) consisting 115 // of trailing portions of enclosing fields, and maybe a final 116 // literal part. 117 if (index == len) { 118 if (enclosingFields.length() > 0) { 119 const auto& enclosing = fields[enclosingFields.popCopy()]; 120 *part = {enclosing.type, sourceMap.source(enclosing), enclosing.end}; 121 122 // If additional enclosing fields end where this part ends, 123 // pop them as well. 124 popEnclosingFieldsEndingAt(part->endIndex); 125 } else { 126 *part = {NumberPartType::Literal, sourceMap.source(limit), limit}; 127 } 128 129 return true; 130 } 131 132 // Otherwise we still have a field to process. 133 const NumberFormatField* current = &fields[index]; 134 MOZ_ASSERT(lastEnd <= current->begin); 135 MOZ_ASSERT(current->begin < current->end); 136 137 // But first, deal with inter-field space. 138 if (lastEnd < current->begin) { 139 if (enclosingFields.length() > 0) { 140 // Space between fields, within an enclosing field, is part 141 // of that enclosing field, until the start of the current 142 // field or the end of the enclosing field, whichever is 143 // earlier. 144 const auto& enclosing = fields[enclosingFields.back()]; 145 *part = {enclosing.type, sourceMap.source(enclosing), 146 std::min(enclosing.end, current->begin)}; 147 popEnclosingFieldsEndingAt(part->endIndex); 148 } else { 149 // If there's no enclosing field, the space is a literal. 150 *part = {NumberPartType::Literal, sourceMap.source(current->begin), 151 current->begin}; 152 } 153 154 return true; 155 } 156 157 // Otherwise, the part spans a prefix of the current field. Find 158 // the most-nested field containing that prefix. 159 const NumberFormatField* next; 160 do { 161 current = &fields[index]; 162 163 // If the current field is last, the part extends to its end. 164 if (++index == len) { 165 *part = {current->type, sourceMap.source(*current), current->end}; 166 return true; 167 } 168 169 next = &fields[index]; 170 MOZ_ASSERT(current->begin <= next->begin); 171 MOZ_ASSERT(current->begin < next->end); 172 173 // If the next field nests within the current field, push an 174 // enclosing field. (If there are no nested fields, don't 175 // bother pushing a field that'd be immediately popped.) 176 if (current->end > next->begin) { 177 if (!enclosingFields.append(index - 1)) { 178 return false; 179 } 180 } 181 182 // Do so until the next field begins after this one. 183 } while (current->begin == next->begin); 184 185 if (current->end <= next->begin) { 186 // The next field begins after the current field ends. Therefore 187 // the current part ends at the end of the current field. 188 *part = {current->type, sourceMap.source(*current), current->end}; 189 popEnclosingFieldsEndingAt(part->endIndex); 190 } else { 191 // The current field encloses the next one. The current part 192 // ends where the next field/part will start. 193 *part = {current->type, sourceMap.source(*current), next->begin}; 194 } 195 196 return true; 197 } 198 199 public: 200 PartGenerator(const FieldsVector& vec, uint32_t limit, 201 const NumberPartSourceMap& sourceMap) 202 : fields(vec), limit(limit), sourceMap(sourceMap) {} 203 204 bool nextPart(bool* hasPart, NumberPart* part) { 205 // There are no parts left if we've partitioned the entire string. 206 if (lastEnd == limit) { 207 MOZ_ASSERT(enclosingFields.length() == 0); 208 *hasPart = false; 209 return true; 210 } 211 212 if (!nextPartInternal(part)) { 213 return false; 214 } 215 216 *hasPart = true; 217 lastEnd = part->endIndex; 218 return true; 219 } 220 }; 221 222 // Finally, generate the result array. 223 size_t lastEndIndex = 0; 224 225 PartGenerator gen(fields_, overallLength, sourceMap); 226 do { 227 bool hasPart; 228 NumberPart part; 229 if (!gen.nextPart(&hasPart, &part)) { 230 return false; 231 } 232 233 if (!hasPart) { 234 break; 235 } 236 237 MOZ_ASSERT(lastEndIndex < part.endIndex); 238 239 if (!parts.append(part)) { 240 return false; 241 } 242 243 lastEndIndex = part.endIndex; 244 } while (true); 245 246 MOZ_ASSERT(lastEndIndex == overallLength, 247 "result array must partition the entire string"); 248 249 return lastEndIndex == overallLength; 250 } 251 252 Result<std::u16string_view, ICUError> FormatResultToParts( 253 const UFormattedNumber* value, Maybe<double> number, bool isNegative, 254 bool formatForUnit, NumberPartVector& parts) { 255 UErrorCode status = U_ZERO_ERROR; 256 257 const UFormattedValue* formattedValue = unumf_resultAsValue(value, &status); 258 if (U_FAILURE(status)) { 259 return Err(ToICUError(status)); 260 } 261 262 return FormatResultToParts(formattedValue, number, isNegative, formatForUnit, 263 parts); 264 } 265 266 Result<std::u16string_view, ICUError> FormatResultToParts( 267 const UFormattedValue* value, Maybe<double> number, bool isNegative, 268 bool formatForUnit, NumberPartVector& parts) { 269 UErrorCode status = U_ZERO_ERROR; 270 271 int32_t utf16Length; 272 const char16_t* utf16Str = ufmtval_getString(value, &utf16Length, &status); 273 if (U_FAILURE(status)) { 274 return Err(ToICUError(status)); 275 } 276 277 UConstrainedFieldPosition* fpos = ucfpos_open(&status); 278 if (U_FAILURE(status)) { 279 return Err(ToICUError(status)); 280 } 281 ScopedICUObject<UConstrainedFieldPosition, ucfpos_close> toCloseFpos(fpos); 282 283 // We're only interested in UFIELD_CATEGORY_NUMBER fields. 284 ucfpos_constrainCategory(fpos, UFIELD_CATEGORY_NUMBER, &status); 285 if (U_FAILURE(status)) { 286 return Err(ToICUError(status)); 287 } 288 289 // Vacuum up fields in the overall formatted string. 290 NumberFormatFields fields; 291 292 while (true) { 293 bool hasMore = ufmtval_nextPosition(value, fpos, &status); 294 if (U_FAILURE(status)) { 295 return Err(ToICUError(status)); 296 } 297 if (!hasMore) { 298 break; 299 } 300 301 int32_t fieldName = ucfpos_getField(fpos, &status); 302 if (U_FAILURE(status)) { 303 return Err(ToICUError(status)); 304 } 305 306 int32_t beginIndex, endIndex; 307 ucfpos_getIndexes(fpos, &beginIndex, &endIndex, &status); 308 if (U_FAILURE(status)) { 309 return Err(ToICUError(status)); 310 } 311 312 Maybe<NumberPartType> partType = GetPartTypeForNumberField( 313 UNumberFormatFields(fieldName), number, isNegative, formatForUnit); 314 if (!partType || !fields.append(*partType, beginIndex, endIndex)) { 315 return Err(ICUError::InternalError); 316 } 317 } 318 319 if (!fields.toPartsVector(utf16Length, parts)) { 320 return Err(ICUError::InternalError); 321 } 322 323 return std::u16string_view(utf16Str, static_cast<size_t>(utf16Length)); 324 } 325 326 // See intl/icu/source/i18n/unicode/unum.h for a detailed field list. This 327 // list is deliberately exhaustive: cases might have to be added/removed if 328 // this code is compiled with a different ICU with more UNumberFormatFields 329 // enum initializers. Please guard such cases with appropriate ICU 330 // version-testing #ifdefs, should cross-version divergence occur. 331 Maybe<NumberPartType> GetPartTypeForNumberField(UNumberFormatFields fieldName, 332 Maybe<double> number, 333 bool isNegative, 334 bool formatForUnit) { 335 switch (fieldName) { 336 case UNUM_INTEGER_FIELD: 337 if (number.isSome()) { 338 if (std::isnan(*number)) { 339 return Some(NumberPartType::Nan); 340 } 341 if (!std::isfinite(*number)) { 342 return Some(NumberPartType::Infinity); 343 } 344 } 345 return Some(NumberPartType::Integer); 346 case UNUM_FRACTION_FIELD: 347 return Some(NumberPartType::Fraction); 348 case UNUM_DECIMAL_SEPARATOR_FIELD: 349 return Some(NumberPartType::Decimal); 350 case UNUM_EXPONENT_SYMBOL_FIELD: 351 return Some(NumberPartType::ExponentSeparator); 352 case UNUM_EXPONENT_SIGN_FIELD: 353 return Some(NumberPartType::ExponentMinusSign); 354 case UNUM_EXPONENT_FIELD: 355 return Some(NumberPartType::ExponentInteger); 356 case UNUM_GROUPING_SEPARATOR_FIELD: 357 return Some(NumberPartType::Group); 358 case UNUM_CURRENCY_FIELD: 359 return Some(NumberPartType::Currency); 360 case UNUM_PERCENT_FIELD: 361 if (formatForUnit) { 362 return Some(NumberPartType::Unit); 363 } 364 return Some(NumberPartType::Percent); 365 case UNUM_PERMILL_FIELD: 366 MOZ_ASSERT_UNREACHABLE( 367 "unexpected permill field found, even though " 368 "we don't use any user-defined patterns that " 369 "would require a permill field"); 370 break; 371 case UNUM_SIGN_FIELD: 372 if (isNegative) { 373 return Some(NumberPartType::MinusSign); 374 } 375 return Some(NumberPartType::PlusSign); 376 case UNUM_MEASURE_UNIT_FIELD: 377 return Some(NumberPartType::Unit); 378 case UNUM_COMPACT_FIELD: 379 return Some(NumberPartType::Compact); 380 case UNUM_APPROXIMATELY_SIGN_FIELD: 381 return Some(NumberPartType::ApproximatelySign); 382 #ifndef U_HIDE_DEPRECATED_API 383 case UNUM_FIELD_COUNT: 384 MOZ_ASSERT_UNREACHABLE( 385 "format field sentinel value returned by iterator!"); 386 break; 387 #endif 388 } 389 390 MOZ_ASSERT_UNREACHABLE( 391 "unenumerated, undocumented format field returned by iterator"); 392 return Nothing(); 393 } 394 395 } // namespace mozilla::intl