tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

NumberFormatFields.cpp (13672B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 #include "ICU4CGlue.h"
      5 #include "NumberFormatFields.h"
      6 #include "ScopedICUObject.h"
      7 
      8 #include "unicode/uformattedvalue.h"
      9 #include "unicode/unum.h"
     10 #include "unicode/unumberformatter.h"
     11 
     12 namespace mozilla::intl {
     13 
     14 bool NumberFormatFields::append(NumberPartType type, int32_t begin,
     15                                int32_t end) {
     16  MOZ_ASSERT(begin >= 0);
     17  MOZ_ASSERT(end >= 0);
     18  MOZ_ASSERT(begin < end, "erm, aren't fields always non-empty?");
     19 
     20  return fields_.emplaceBack(uint32_t(begin), uint32_t(end), type);
     21 }
     22 
     23 bool NumberFormatFields::toPartsVector(size_t overallLength,
     24                                       const NumberPartSourceMap& sourceMap,
     25                                       NumberPartVector& parts) {
     26  std::sort(fields_.begin(), fields_.end(),
     27            [](const NumberFormatField& left, const NumberFormatField& right) {
     28              // Sort first by begin index, then to place
     29              // enclosing fields before nested fields.
     30              return left.begin < right.begin ||
     31                     (left.begin == right.begin && left.end > right.end);
     32            });
     33 
     34  // Then iterate over the sorted field list to generate a sequence of parts
     35  // (what ECMA-402 actually exposes).  A part is a maximal character sequence
     36  // entirely within no field or a single most-nested field.
     37  //
     38  // Diagrams may be helpful to illustrate how fields map to parts.  Consider
     39  // formatting -19,766,580,028,249.41, the US national surplus (negative
     40  // because it's actually a debt) on October 18, 2016.
     41  //
     42  //    var options =
     43  //      { style: "currency", currency: "USD", currencyDisplay: "name" };
     44  //    var usdFormatter = new Intl.NumberFormat("en-US", options);
     45  //    usdFormatter.format(-19766580028249.41);
     46  //
     47  // The formatted result is "-19,766,580,028,249.41 US dollars".  ICU
     48  // identifies these fields in the string:
     49  //
     50  //     UNUM_GROUPING_SEPARATOR_FIELD
     51  //                   |
     52  //   UNUM_SIGN_FIELD |  UNUM_DECIMAL_SEPARATOR_FIELD
     53  //    |   __________/|   |
     54  //    |  /   |   |   |   |
     55  //   "-19,766,580,028,249.41 US dollars"
     56  //     \________________/ |/ \_______/
     57  //             |          |      |
     58  //    UNUM_INTEGER_FIELD  |  UNUM_CURRENCY_FIELD
     59  //                        |
     60  //               UNUM_FRACTION_FIELD
     61  //
     62  // These fields map to parts as follows:
     63  //
     64  //         integer     decimal
     65  //       _____|________  |
     66  //      /  /| |\  |\  |\ |  literal
     67  //     /| / | | \ | \ | \|  |
     68  //   "-19,766,580,028,249.41 US dollars"
     69  //    |  \___|___|___/    |/ \________/
     70  //    |        |          |       |
     71  //    |      group        |   currency
     72  //    |                   |
     73  //   minusSign        fraction
     74  //
     75  // The sign is a part.  Each comma is a part, splitting the integer field
     76  // into parts for trillions/billions/&c. digits.  The decimal point is a
     77  // part.  Cents are a part.  The space between cents and currency is a part
     78  // (outside any field).  Last, the currency field is a part.
     79 
     80  class PartGenerator {
     81    // The fields in order from start to end, then least to most nested.
     82    const FieldsVector& fields;
     83 
     84    // Index of the current field, in |fields|, being considered to
     85    // determine part boundaries.  |lastEnd <= fields[index].begin| is an
     86    // invariant.
     87    size_t index = 0;
     88 
     89    // The end index of the last part produced, always less than or equal
     90    // to |limit|, strictly increasing.
     91    uint32_t lastEnd = 0;
     92 
     93    // The length of the overall formatted string.
     94    const uint32_t limit = 0;
     95 
     96    NumberPartSourceMap sourceMap;
     97 
     98    Vector<size_t, 4> enclosingFields;
     99 
    100    void popEnclosingFieldsEndingAt(uint32_t end) {
    101      MOZ_ASSERT_IF(enclosingFields.length() > 0,
    102                    fields[enclosingFields.back()].end >= end);
    103 
    104      while (enclosingFields.length() > 0 &&
    105             fields[enclosingFields.back()].end == end) {
    106        enclosingFields.popBack();
    107      }
    108    }
    109 
    110    bool nextPartInternal(NumberPart* part) {
    111      size_t len = fields.length();
    112      MOZ_ASSERT(index <= len);
    113 
    114      // If we're out of fields, all that remains are part(s) consisting
    115      // of trailing portions of enclosing fields, and maybe a final
    116      // literal part.
    117      if (index == len) {
    118        if (enclosingFields.length() > 0) {
    119          const auto& enclosing = fields[enclosingFields.popCopy()];
    120          *part = {enclosing.type, sourceMap.source(enclosing), enclosing.end};
    121 
    122          // If additional enclosing fields end where this part ends,
    123          // pop them as well.
    124          popEnclosingFieldsEndingAt(part->endIndex);
    125        } else {
    126          *part = {NumberPartType::Literal, sourceMap.source(limit), limit};
    127        }
    128 
    129        return true;
    130      }
    131 
    132      // Otherwise we still have a field to process.
    133      const NumberFormatField* current = &fields[index];
    134      MOZ_ASSERT(lastEnd <= current->begin);
    135      MOZ_ASSERT(current->begin < current->end);
    136 
    137      // But first, deal with inter-field space.
    138      if (lastEnd < current->begin) {
    139        if (enclosingFields.length() > 0) {
    140          // Space between fields, within an enclosing field, is part
    141          // of that enclosing field, until the start of the current
    142          // field or the end of the enclosing field, whichever is
    143          // earlier.
    144          const auto& enclosing = fields[enclosingFields.back()];
    145          *part = {enclosing.type, sourceMap.source(enclosing),
    146                   std::min(enclosing.end, current->begin)};
    147          popEnclosingFieldsEndingAt(part->endIndex);
    148        } else {
    149          // If there's no enclosing field, the space is a literal.
    150          *part = {NumberPartType::Literal, sourceMap.source(current->begin),
    151                   current->begin};
    152        }
    153 
    154        return true;
    155      }
    156 
    157      // Otherwise, the part spans a prefix of the current field.  Find
    158      // the most-nested field containing that prefix.
    159      const NumberFormatField* next;
    160      do {
    161        current = &fields[index];
    162 
    163        // If the current field is last, the part extends to its end.
    164        if (++index == len) {
    165          *part = {current->type, sourceMap.source(*current), current->end};
    166          return true;
    167        }
    168 
    169        next = &fields[index];
    170        MOZ_ASSERT(current->begin <= next->begin);
    171        MOZ_ASSERT(current->begin < next->end);
    172 
    173        // If the next field nests within the current field, push an
    174        // enclosing field.  (If there are no nested fields, don't
    175        // bother pushing a field that'd be immediately popped.)
    176        if (current->end > next->begin) {
    177          if (!enclosingFields.append(index - 1)) {
    178            return false;
    179          }
    180        }
    181 
    182        // Do so until the next field begins after this one.
    183      } while (current->begin == next->begin);
    184 
    185      if (current->end <= next->begin) {
    186        // The next field begins after the current field ends.  Therefore
    187        // the current part ends at the end of the current field.
    188        *part = {current->type, sourceMap.source(*current), current->end};
    189        popEnclosingFieldsEndingAt(part->endIndex);
    190      } else {
    191        // The current field encloses the next one.  The current part
    192        // ends where the next field/part will start.
    193        *part = {current->type, sourceMap.source(*current), next->begin};
    194      }
    195 
    196      return true;
    197    }
    198 
    199   public:
    200    PartGenerator(const FieldsVector& vec, uint32_t limit,
    201                  const NumberPartSourceMap& sourceMap)
    202        : fields(vec), limit(limit), sourceMap(sourceMap) {}
    203 
    204    bool nextPart(bool* hasPart, NumberPart* part) {
    205      // There are no parts left if we've partitioned the entire string.
    206      if (lastEnd == limit) {
    207        MOZ_ASSERT(enclosingFields.length() == 0);
    208        *hasPart = false;
    209        return true;
    210      }
    211 
    212      if (!nextPartInternal(part)) {
    213        return false;
    214      }
    215 
    216      *hasPart = true;
    217      lastEnd = part->endIndex;
    218      return true;
    219    }
    220  };
    221 
    222  // Finally, generate the result array.
    223  size_t lastEndIndex = 0;
    224 
    225  PartGenerator gen(fields_, overallLength, sourceMap);
    226  do {
    227    bool hasPart;
    228    NumberPart part;
    229    if (!gen.nextPart(&hasPart, &part)) {
    230      return false;
    231    }
    232 
    233    if (!hasPart) {
    234      break;
    235    }
    236 
    237    MOZ_ASSERT(lastEndIndex < part.endIndex);
    238 
    239    if (!parts.append(part)) {
    240      return false;
    241    }
    242 
    243    lastEndIndex = part.endIndex;
    244  } while (true);
    245 
    246  MOZ_ASSERT(lastEndIndex == overallLength,
    247             "result array must partition the entire string");
    248 
    249  return lastEndIndex == overallLength;
    250 }
    251 
    252 Result<std::u16string_view, ICUError> FormatResultToParts(
    253    const UFormattedNumber* value, Maybe<double> number, bool isNegative,
    254    bool formatForUnit, NumberPartVector& parts) {
    255  UErrorCode status = U_ZERO_ERROR;
    256 
    257  const UFormattedValue* formattedValue = unumf_resultAsValue(value, &status);
    258  if (U_FAILURE(status)) {
    259    return Err(ToICUError(status));
    260  }
    261 
    262  return FormatResultToParts(formattedValue, number, isNegative, formatForUnit,
    263                             parts);
    264 }
    265 
    266 Result<std::u16string_view, ICUError> FormatResultToParts(
    267    const UFormattedValue* value, Maybe<double> number, bool isNegative,
    268    bool formatForUnit, NumberPartVector& parts) {
    269  UErrorCode status = U_ZERO_ERROR;
    270 
    271  int32_t utf16Length;
    272  const char16_t* utf16Str = ufmtval_getString(value, &utf16Length, &status);
    273  if (U_FAILURE(status)) {
    274    return Err(ToICUError(status));
    275  }
    276 
    277  UConstrainedFieldPosition* fpos = ucfpos_open(&status);
    278  if (U_FAILURE(status)) {
    279    return Err(ToICUError(status));
    280  }
    281  ScopedICUObject<UConstrainedFieldPosition, ucfpos_close> toCloseFpos(fpos);
    282 
    283  // We're only interested in UFIELD_CATEGORY_NUMBER fields.
    284  ucfpos_constrainCategory(fpos, UFIELD_CATEGORY_NUMBER, &status);
    285  if (U_FAILURE(status)) {
    286    return Err(ToICUError(status));
    287  }
    288 
    289  // Vacuum up fields in the overall formatted string.
    290  NumberFormatFields fields;
    291 
    292  while (true) {
    293    bool hasMore = ufmtval_nextPosition(value, fpos, &status);
    294    if (U_FAILURE(status)) {
    295      return Err(ToICUError(status));
    296    }
    297    if (!hasMore) {
    298      break;
    299    }
    300 
    301    int32_t fieldName = ucfpos_getField(fpos, &status);
    302    if (U_FAILURE(status)) {
    303      return Err(ToICUError(status));
    304    }
    305 
    306    int32_t beginIndex, endIndex;
    307    ucfpos_getIndexes(fpos, &beginIndex, &endIndex, &status);
    308    if (U_FAILURE(status)) {
    309      return Err(ToICUError(status));
    310    }
    311 
    312    Maybe<NumberPartType> partType = GetPartTypeForNumberField(
    313        UNumberFormatFields(fieldName), number, isNegative, formatForUnit);
    314    if (!partType || !fields.append(*partType, beginIndex, endIndex)) {
    315      return Err(ICUError::InternalError);
    316    }
    317  }
    318 
    319  if (!fields.toPartsVector(utf16Length, parts)) {
    320    return Err(ICUError::InternalError);
    321  }
    322 
    323  return std::u16string_view(utf16Str, static_cast<size_t>(utf16Length));
    324 }
    325 
    326 // See intl/icu/source/i18n/unicode/unum.h for a detailed field list.  This
    327 // list is deliberately exhaustive: cases might have to be added/removed if
    328 // this code is compiled with a different ICU with more UNumberFormatFields
    329 // enum initializers.  Please guard such cases with appropriate ICU
    330 // version-testing #ifdefs, should cross-version divergence occur.
    331 Maybe<NumberPartType> GetPartTypeForNumberField(UNumberFormatFields fieldName,
    332                                                Maybe<double> number,
    333                                                bool isNegative,
    334                                                bool formatForUnit) {
    335  switch (fieldName) {
    336    case UNUM_INTEGER_FIELD:
    337      if (number.isSome()) {
    338        if (std::isnan(*number)) {
    339          return Some(NumberPartType::Nan);
    340        }
    341        if (!std::isfinite(*number)) {
    342          return Some(NumberPartType::Infinity);
    343        }
    344      }
    345      return Some(NumberPartType::Integer);
    346    case UNUM_FRACTION_FIELD:
    347      return Some(NumberPartType::Fraction);
    348    case UNUM_DECIMAL_SEPARATOR_FIELD:
    349      return Some(NumberPartType::Decimal);
    350    case UNUM_EXPONENT_SYMBOL_FIELD:
    351      return Some(NumberPartType::ExponentSeparator);
    352    case UNUM_EXPONENT_SIGN_FIELD:
    353      return Some(NumberPartType::ExponentMinusSign);
    354    case UNUM_EXPONENT_FIELD:
    355      return Some(NumberPartType::ExponentInteger);
    356    case UNUM_GROUPING_SEPARATOR_FIELD:
    357      return Some(NumberPartType::Group);
    358    case UNUM_CURRENCY_FIELD:
    359      return Some(NumberPartType::Currency);
    360    case UNUM_PERCENT_FIELD:
    361      if (formatForUnit) {
    362        return Some(NumberPartType::Unit);
    363      }
    364      return Some(NumberPartType::Percent);
    365    case UNUM_PERMILL_FIELD:
    366      MOZ_ASSERT_UNREACHABLE(
    367          "unexpected permill field found, even though "
    368          "we don't use any user-defined patterns that "
    369          "would require a permill field");
    370      break;
    371    case UNUM_SIGN_FIELD:
    372      if (isNegative) {
    373        return Some(NumberPartType::MinusSign);
    374      }
    375      return Some(NumberPartType::PlusSign);
    376    case UNUM_MEASURE_UNIT_FIELD:
    377      return Some(NumberPartType::Unit);
    378    case UNUM_COMPACT_FIELD:
    379      return Some(NumberPartType::Compact);
    380    case UNUM_APPROXIMATELY_SIGN_FIELD:
    381      return Some(NumberPartType::ApproximatelySign);
    382 #ifndef U_HIDE_DEPRECATED_API
    383    case UNUM_FIELD_COUNT:
    384      MOZ_ASSERT_UNREACHABLE(
    385          "format field sentinel value returned by iterator!");
    386      break;
    387 #endif
    388  }
    389 
    390  MOZ_ASSERT_UNREACHABLE(
    391      "unenumerated, undocumented format field returned by iterator");
    392  return Nothing();
    393 }
    394 
    395 }  // namespace mozilla::intl