DateTimeFormat.h (22126B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 #ifndef intl_components_DateTimeFormat_h_ 5 #define intl_components_DateTimeFormat_h_ 6 #include <functional> 7 #include "unicode/udat.h" 8 9 #include "mozilla/intl/ICU4CGlue.h" 10 #include "mozilla/intl/ICUError.h" 11 12 #include "mozilla/intl/DateTimePart.h" 13 #include "mozilla/intl/DateTimePatternGenerator.h" 14 #include "mozilla/Maybe.h" 15 #include "mozilla/Span.h" 16 #include "mozilla/Try.h" 17 #include "mozilla/UniquePtr.h" 18 #include "mozilla/Vector.h" 19 20 /* 21 * To work around webcompat problems caused by Narrow No-Break Space in 22 * formatted date/time output, where existing code on the web naively 23 * assumes there will be a normal Space, we replace any occurrences of 24 * U+202F in the formatted results with U+0020. 25 * 26 * The intention is to undo this hack once other major browsers are also 27 * ready to ship with the updated (ICU72) i18n data that uses NNBSP. 28 * 29 * See https://bugzilla.mozilla.org/show_bug.cgi?id=1806042 for details, 30 * and see DateIntervalFormat.cpp for the other piece of this hack. 31 */ 32 #define DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES 1 33 34 namespace mozilla::intl { 35 36 #if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES 37 static inline bool IsSpecialSpace(char16_t c) { 38 // NARROW NO-BREAK SPACE and THIN SPACE 39 return c == 0x202F || c == 0x2009; 40 } 41 #endif 42 43 class Calendar; 44 45 /** 46 * Intro to mozilla::intl::DateTimeFormat 47 * ====================================== 48 * 49 * This component is a Mozilla-focused API for the date formatting provided by 50 * ICU. The methods internally call out to ICU4C. This is responsible for and 51 * owns any resources opened through ICU, through RAII. 52 * 53 * The construction of a DateTimeFormat contains the majority of the cost 54 * of the DateTimeFormat operation. DateTimeFormat::TryFormat should be 55 * relatively inexpensive after the initial construction. 56 * 57 * This class supports creating from Styles (a fixed set of options) and from a 58 * components bag (a list of components and their lengths). 59 * 60 * This API serves to back the ECMA-402 Intl.DateTimeFormat API. 61 * https://tc39.es/ecma402/#datetimeformat-objects 62 * 63 * 64 * ECMA-402 Intl.DateTimeFormat API and implementation details with ICU 65 * skeletons and patterns. 66 * ==================================================================== 67 * 68 * Different locales have different ways to display dates using the same 69 * basic components. For example, en-US might use "Sept. 24, 2012" while 70 * fr-FR might use "24 Sept. 2012". The intent of Intl.DateTimeFormat is to 71 * permit production of a format for the locale that best matches the 72 * set of date-time components and their desired representation as specified 73 * by the API client. 74 * 75 * ICU4C supports specification of date and time formats in three ways: 76 * 77 * 1) A style is just one of the identifiers FULL, LONG, MEDIUM, or SHORT. 78 * The date-time components included in each style and their representation 79 * are defined by ICU using CLDR locale data (CLDR is the Unicode 80 * Consortium's Common Locale Data Repository). 81 * 82 * 2) A skeleton is a string specifying which date-time components to include, 83 * and which representations to use for them. For example, "yyyyMMMMdd" 84 * specifies a year with at least four digits, a full month name, and a 85 * two-digit day. It does not specify in which order the components appear, 86 * how they are separated, the localized strings for textual components 87 * (such as weekday or month), whether the month is in format or 88 * stand-alone form¹, or the numbering system used for numeric components. 89 * All that information is filled in by ICU using CLDR locale data. 90 * ¹ The format form is the one used in formatted strings that include a 91 * day; the stand-alone form is used when not including days, e.g., in 92 * calendar headers. The two forms differ at least in some Slavic languages, 93 * e.g. Russian: "22 марта 2013 г." vs. "Март 2013". 94 * 95 * 3) A pattern is a string specifying which date-time components to include, 96 * in which order, with which separators, in which grammatical case. For 97 * example, "EEEE, d MMMM y" specifies the full localized weekday name, 98 * followed by comma and space, followed by the day, followed by space, 99 * followed by the full month name in format form, followed by space, 100 * followed by the full year. It 101 * still does not specify localized strings for textual components and the 102 * numbering system - these are determined by ICU using CLDR locale data or 103 * possibly API parameters. 104 * 105 * All actual formatting in ICU4C is done with patterns; styles and skeletons 106 * have to be mapped to patterns before processing. 107 * 108 * The options of Intl.DateTimeFormat most closely correspond to ICU skeletons. 109 * This implementation therefore converts DateTimeFormat options to ICU 110 * skeletons, and then lets ICU map skeletons to actual ICU patterns. The 111 * pattern may not directly correspond to what the skeleton requests, as the 112 * mapper (UDateTimePatternGenerator) is constrained by the available locale 113 * data for the locale. 114 * 115 * An ICU pattern represents the information of the following DateTimeFormat 116 * internal properties described in the specification, which therefore don't 117 * exist separately in the implementation: 118 * - [[weekday]], [[era]], [[year]], [[month]], [[day]], [[hour]], [[minute]], 119 * [[second]], [[timeZoneName]] 120 * - [[hour12]] 121 * - [[hourCycle]] 122 * - [[hourNo0]] 123 * When needed for the resolvedOptions method, the resolveICUPattern function 124 * queries the UDateFormat's internal pattern and then maps the it back to the 125 * specified properties of the object returned by resolvedOptions. 126 * 127 * ICU date-time skeletons and patterns aren't fully documented in the ICU 128 * documentation (see http://bugs.icu-project.org/trac/ticket/9627). The best 129 * documentation at this point is in UTR 35: 130 * http://unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns 131 * 132 * Future support for ICU4X 133 * ======================== 134 * This implementation exposes a components bag, and internally handles the 135 * complexity of working with skeletons and patterns to generate the correct 136 * results. In the future, if and when we switch to ICU4X, the complexities of 137 * manipulating patterns will be able to be removed, as ICU4X will directly know 138 * how to apply the components bag. 139 */ 140 class DateTimeFormat final { 141 public: 142 /** 143 * The hour cycle for components. 144 */ 145 enum class HourCycle { 146 H11, 147 H12, 148 H23, 149 H24, 150 }; 151 152 /** 153 * The style for dates or times. 154 */ 155 enum class Style { 156 Full, 157 Long, 158 Medium, 159 Short, 160 }; 161 162 /** 163 * A bag of options to determine the length of the time and date styles. The 164 * hour cycle can be overridden. 165 */ 166 struct StyleBag { 167 Maybe<Style> date = Nothing(); 168 Maybe<Style> time = Nothing(); 169 Maybe<HourCycle> hourCycle = Nothing(); 170 Maybe<bool> hour12 = Nothing(); 171 }; 172 173 /** 174 * How to to display numeric components such as the year and the day. 175 */ 176 enum class Numeric { 177 Numeric, 178 TwoDigit, 179 }; 180 181 /** 182 * How to display the text components, such as the weekday or day period. 183 */ 184 enum class Text { 185 Long, 186 Short, 187 Narrow, 188 }; 189 190 /** 191 * How to display the month. 192 */ 193 enum class Month { 194 Numeric, 195 TwoDigit, 196 Long, 197 Short, 198 Narrow, 199 }; 200 201 /** 202 * How to display the time zone name. 203 */ 204 enum class TimeZoneName { 205 Long, 206 Short, 207 ShortOffset, 208 LongOffset, 209 ShortGeneric, 210 LongGeneric, 211 }; 212 213 /** 214 * Get static strings representing the enums. These match ECMA-402's resolved 215 * options. 216 * https://tc39.es/ecma402/#sec-intl.datetimeformat.prototype.resolvedoptions 217 */ 218 static const char* ToString(DateTimeFormat::HourCycle aHourCycle); 219 static const char* ToString(DateTimeFormat::Style aStyle); 220 static const char* ToString(DateTimeFormat::Numeric aNumeric); 221 static const char* ToString(DateTimeFormat::Text aText); 222 static const char* ToString(DateTimeFormat::Month aMonth); 223 static const char* ToString(DateTimeFormat::TimeZoneName aTimeZoneName); 224 225 /** 226 * A components bag specifies the components used to display a DateTime. Each 227 * component can be styled individually, and ICU will attempt to create a best 228 * match for a given locale. 229 */ 230 struct ComponentsBag { 231 Maybe<Text> era = Nothing(); 232 Maybe<Numeric> year = Nothing(); 233 Maybe<Month> month = Nothing(); 234 Maybe<Numeric> day = Nothing(); 235 Maybe<Text> weekday = Nothing(); 236 Maybe<Numeric> hour = Nothing(); 237 Maybe<Numeric> minute = Nothing(); 238 Maybe<Numeric> second = Nothing(); 239 Maybe<TimeZoneName> timeZoneName = Nothing(); 240 Maybe<bool> hour12 = Nothing(); 241 Maybe<HourCycle> hourCycle = Nothing(); 242 Maybe<Text> dayPeriod = Nothing(); 243 Maybe<uint8_t> fractionalSecondDigits = Nothing(); 244 }; 245 246 // Do not allow copy as this class owns the ICU resource. Move is not 247 // currently implemented, but a custom move operator could be created if 248 // needed. 249 DateTimeFormat(const DateTimeFormat&) = delete; 250 DateTimeFormat& operator=(const DateTimeFormat&) = delete; 251 252 // mozilla::Vector can avoid heap allocations for small transient buffers. 253 using PatternVector = Vector<char16_t, 128>; 254 using SkeletonVector = Vector<char16_t, 16>; 255 256 /** 257 * Create a DateTimeFormat from styles. 258 * 259 * The "style" model uses different options for formatting a date or time 260 * based on how the result will be styled, rather than picking specific 261 * fields or lengths. 262 * 263 * Takes an optional time zone which will override the user's default 264 * time zone. This is a UTF-16 string that takes the form "GMT±hh:mm", or 265 * an IANA time zone identifier, e.g. "America/Chicago". 266 */ 267 static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromStyle( 268 Span<const char> aLocale, const StyleBag& aStyleBag, 269 DateTimePatternGenerator* aDateTimePatternGenerator, 270 Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{}); 271 272 private: 273 /** 274 * Create a DateTimeFormat from a UTF-16 skeleton. 275 * 276 * A skeleton is an unordered list of fields that are used to find an 277 * appropriate date time format pattern. Example skeletons would be "yMd", 278 * "yMMMd", "EBhm". If the skeleton includes string literals or other 279 * information, it will be discarded when matching against skeletons. 280 * 281 * Takes an optional time zone which will override the user's default 282 * time zone. This is a string that takes the form "GMT±hh:mm", or 283 * an IANA time zone identifier, e.g. "America/Chicago". 284 */ 285 static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromSkeleton( 286 Span<const char> aLocale, Span<const char16_t> aSkeleton, 287 DateTimePatternGenerator* aDateTimePatternGenerator, 288 Maybe<DateTimeFormat::HourCycle> aHourCycle, 289 Maybe<Span<const char16_t>> aTimeZoneOverride); 290 291 public: 292 /** 293 * Create a DateTimeFormat from a ComponentsBag. 294 * 295 * See the ComponentsBag for additional documentation. 296 * 297 * Takes an optional time zone which will override the user's default 298 * time zone. This is a string that takes the form "GMT±hh:mm", or 299 * an IANA time zone identifier, e.g. "America/Chicago". 300 */ 301 static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromComponents( 302 Span<const char> aLocale, const ComponentsBag& bag, 303 DateTimePatternGenerator* aDateTimePatternGenerator, 304 Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{}); 305 306 /** 307 * Create a DateTimeFormat from a raw pattern. 308 * 309 * Warning: This method should not be added to new code. In the near future we 310 * plan to remove it. 311 */ 312 static Result<UniquePtr<DateTimeFormat>, ICUError> TryCreateFromPattern( 313 Span<const char> aLocale, Span<const char16_t> aPattern, 314 Maybe<Span<const char16_t>> aTimeZoneOverride = Nothing{}); 315 316 /** 317 * Use the format settings to format a date time into a string. The non-null 318 * terminated string will be placed into the provided buffer. The idea behind 319 * this API is that the constructor is expensive, and then the format 320 * operation is cheap. 321 * 322 * aUnixEpoch is the number of milliseconds since 1 January 1970, UTC. 323 */ 324 template <typename B> 325 ICUResult TryFormat(double aUnixEpoch, B& aBuffer) const { 326 static_assert( 327 std::is_same_v<typename B::CharType, unsigned char> || 328 std::is_same_v<typename B::CharType, char> || 329 std::is_same_v<typename B::CharType, char16_t>, 330 "The only buffer CharTypes supported by DateTimeFormat are char " 331 "(for UTF-8 support) and char16_t (for UTF-16 support)."); 332 333 if constexpr (std::is_same_v<typename B::CharType, char> || 334 std::is_same_v<typename B::CharType, unsigned char>) { 335 // The output buffer is UTF-8, but ICU uses UTF-16 internally. 336 337 // Write the formatted date into the u16Buffer. 338 PatternVector u16Vec; 339 340 auto result = FillBufferWithICUCall( 341 u16Vec, [this, &aUnixEpoch](UChar* target, int32_t length, 342 UErrorCode* status) { 343 return udat_format(mDateFormat, aUnixEpoch, target, length, 344 /* UFieldPosition* */ nullptr, status); 345 }); 346 if (result.isErr()) { 347 return result; 348 } 349 350 #if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES 351 for (auto& c : u16Vec) { 352 if (IsSpecialSpace(c)) { 353 c = ' '; 354 } 355 } 356 #endif 357 358 if (!FillBuffer(u16Vec, aBuffer)) { 359 return Err(ICUError::OutOfMemory); 360 } 361 return Ok{}; 362 } else { 363 static_assert(std::is_same_v<typename B::CharType, char16_t>); 364 365 // The output buffer is UTF-16. ICU can output directly into this buffer. 366 auto result = FillBufferWithICUCall( 367 aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { 368 return udat_format(mDateFormat, aUnixEpoch, target, length, nullptr, 369 status); 370 }); 371 if (result.isErr()) { 372 return result; 373 } 374 375 #if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES 376 for (auto& c : Span(aBuffer.data(), aBuffer.length())) { 377 if (IsSpecialSpace(c)) { 378 c = ' '; 379 } 380 } 381 #endif 382 383 return Ok{}; 384 } 385 }; 386 387 /** 388 * Format the Unix epoch time into a DateTimePartVector. 389 * 390 * The caller has to create the buffer and the vector and pass to this method. 391 * The formatted string will be stored in the buffer and formatted parts in 392 * the vector. 393 * 394 * aUnixEpoch is the number of milliseconds since 1 January 1970, UTC. 395 * 396 * See: 397 * https://tc39.es/ecma402/#sec-formatdatetimetoparts 398 */ 399 template <typename B> 400 ICUResult TryFormatToParts(double aUnixEpoch, B& aBuffer, 401 DateTimePartVector& aParts) const { 402 static_assert(std::is_same_v<typename B::CharType, char16_t>, 403 "Only char16_t is supported (for UTF-16 support) now."); 404 405 UErrorCode status = U_ZERO_ERROR; 406 UFieldPositionIterator* fpositer = ufieldpositer_open(&status); 407 if (U_FAILURE(status)) { 408 return Err(ToICUError(status)); 409 } 410 411 auto result = FillBufferWithICUCall( 412 aBuffer, [this, aUnixEpoch, fpositer](UChar* chars, int32_t size, 413 UErrorCode* status) { 414 return udat_formatForFields(mDateFormat, aUnixEpoch, chars, size, 415 fpositer, status); 416 }); 417 if (result.isErr()) { 418 ufieldpositer_close(fpositer); 419 return result.propagateErr(); 420 } 421 422 #if DATE_TIME_FORMAT_REPLACE_SPECIAL_SPACES 423 for (auto& c : Span(aBuffer.data(), aBuffer.length())) { 424 if (IsSpecialSpace(c)) { 425 c = ' '; 426 } 427 } 428 #endif 429 430 return TryFormatToParts(fpositer, aBuffer.length(), aParts); 431 } 432 433 /** 434 * Copies the pattern for the current DateTimeFormat to a buffer. 435 * 436 * Warning: This method should not be added to new code. In the near future we 437 * plan to remove it. 438 */ 439 template <typename B> 440 ICUResult GetPattern(B& aBuffer) const { 441 return FillBufferWithICUCall( 442 aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) { 443 return udat_toPattern(mDateFormat, /* localized*/ false, target, 444 length, status); 445 }); 446 } 447 448 /** 449 * Copies the skeleton that was used to generate the current DateTimeFormat to 450 * the given buffer. If no skeleton was used, then a skeleton is generated 451 * from the resolved pattern. Note that going from skeleton -> resolved 452 * pattern -> skeleton is not a 1:1 mapping, as the resolved pattern can 453 * contain different symbols than the requested skeleton. 454 * 455 * Warning: This method should not be added to new code. In the near future we 456 * plan to remove it. 457 */ 458 template <typename B> 459 ICUResult GetOriginalSkeleton(B& aBuffer) { 460 static_assert(std::is_same_v<typename B::CharType, char16_t>); 461 if (mOriginalSkeleton.length() == 0) { 462 // Generate a skeleton from the resolved pattern, there was no originally 463 // cached skeleton. 464 PatternVector pattern{}; 465 VectorToBufferAdaptor buffer(pattern); 466 MOZ_TRY(GetPattern(buffer)); 467 468 VectorToBufferAdaptor skeleton(mOriginalSkeleton); 469 MOZ_TRY(DateTimePatternGenerator::GetSkeleton(pattern, skeleton)); 470 } 471 472 if (!FillBuffer(mOriginalSkeleton, aBuffer)) { 473 return Err(ICUError::OutOfMemory); 474 } 475 return Ok(); 476 } 477 478 /** 479 * Determines the resolved components for the current DateTimeFormat. 480 * 481 * When a DateTimeFormat is created, even from a components bag, the resolved 482 * formatter may tweak the resolved components depending on the configuration 483 * and the locale. 484 * 485 * For the implementation, with ICU4C, this takes a string pattern and maps it 486 * back to a ComponentsBag. 487 */ 488 Result<ComponentsBag, ICUError> ResolveComponents(); 489 490 ~DateTimeFormat(); 491 492 /** 493 * Clones the Calendar from a DateTimeFormat, and sets its time with the 494 * relative milliseconds since 1 January 1970, UTC. 495 */ 496 Result<UniquePtr<Calendar>, ICUError> CloneCalendar(double aUnixEpoch) const; 497 498 /** 499 * Return the hour cycle used in the input pattern or Nothing if none was 500 * found. 501 */ 502 static Maybe<DateTimeFormat::HourCycle> HourCycleFromPattern( 503 Span<const char16_t> aPattern); 504 505 using HourCyclesVector = Vector<HourCycle, 4>; 506 507 /** 508 * Returns the allowed hour cycles for the input locale. 509 * 510 * NOTE: This function currently takes a language subtag and an optional 511 * region subtag. This is a restriction until bug 1719746 has migrated 512 * language tag processing into the unified Intl component. After bug 1719746, 513 * this function should be changed to accept a single locale tag. 514 */ 515 static Result<HourCyclesVector, ICUError> GetAllowedHourCycles( 516 Span<const char> aLanguage, Maybe<Span<const char>> aRegion); 517 518 /** 519 * Returns an iterator over all supported date-time formatter locales. 520 * 521 * The returned strings are ICU locale identifiers and NOT BCP 47 language 522 * tags. 523 * 524 * Also see <https://unicode-org.github.io/icu/userguide/locale>. 525 */ 526 static auto GetAvailableLocales() { 527 return AvailableLocalesEnumeration<udat_countAvailable, 528 udat_getAvailable>(); 529 } 530 531 /** 532 * Return the time separator for the given locale and numbering system. 533 */ 534 template <typename B> 535 static ICUResult GetTimeSeparator(Span<const char> aLocale, 536 Span<const char> aNumberingSystem, 537 B& aBuffer) { 538 static_assert(std::is_same_v<typename B::CharType, char16_t>); 539 auto separator = GetTimeSeparator(aLocale, aNumberingSystem); 540 if (separator.isErr()) { 541 return separator.propagateErr(); 542 } 543 if (!FillBuffer(separator.unwrap(), aBuffer)) { 544 return Err(ICUError::OutOfMemory); 545 } 546 return Ok(); 547 } 548 549 private: 550 explicit DateTimeFormat(UDateFormat* aDateFormat); 551 552 ICUResult CacheSkeleton(Span<const char16_t> aSkeleton); 553 554 ICUResult TryFormatToParts(UFieldPositionIterator* aFieldPositionIterator, 555 size_t aSpanSize, 556 DateTimePartVector& aParts) const; 557 /** 558 * Replaces all hour pattern characters in |patternOrSkeleton| to use the 559 * matching hour representation for |hourCycle|. 560 */ 561 static void ReplaceHourSymbol(Span<char16_t> aPatternOrSkeleton, 562 DateTimeFormat::HourCycle aHourCycle); 563 564 /** 565 * Find a matching pattern using the requested hour-12 options. 566 * 567 * This function is needed to work around the following two issues. 568 * - https://unicode-org.atlassian.net/browse/ICU-21023 569 * - https://unicode-org.atlassian.net/browse/CLDR-13425 570 * 571 * We're currently using a relatively simple workaround, which doesn't give 572 * the most accurate results. For example: 573 * 574 * ``` 575 * var dtf = new Intl.DateTimeFormat("en", { 576 * timeZone: "UTC", 577 * dateStyle: "long", 578 * timeStyle: "long", 579 * hourCycle: "h12", 580 * }); 581 * print(dtf.format(new Date("2020-01-01T00:00Z"))); 582 * ``` 583 * 584 * Returns the pattern "MMMM d, y 'at' h:mm:ss a z", but when going through 585 * |DateTimePatternGenerator::GetSkeleton| and then 586 * |DateTimePatternGenerator::GetBestPattern| to find an equivalent pattern 587 * for "h23", we'll end up with the pattern "MMMM d, y, HH:mm:ss z", so the 588 * combinator element " 'at' " was lost in the process. 589 */ 590 static ICUResult FindPatternWithHourCycle( 591 DateTimePatternGenerator& aDateTimePatternGenerator, 592 DateTimeFormat::PatternVector& aPattern, bool aHour12, 593 DateTimeFormat::SkeletonVector& aSkeleton); 594 595 static Result<Span<const char16_t>, ICUError> GetTimeSeparator( 596 Span<const char> aLocale, Span<const char> aNumberingSystem); 597 598 UDateFormat* mDateFormat = nullptr; 599 600 SkeletonVector mOriginalSkeleton; 601 }; 602 603 } // namespace mozilla::intl 604 605 #endif