tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

String.h (8320B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 #ifndef intl_components_String_h_
      6 #define intl_components_String_h_
      7 
      8 #include "mozilla/Assertions.h"
      9 #include "mozilla/Casting.h"
     10 #include "mozilla/intl/ICU4CGlue.h"
     11 #include "mozilla/intl/ICUError.h"
     12 #include "mozilla/PodOperations.h"
     13 #include "mozilla/Span.h"
     14 #include "mozilla/Try.h"
     15 
     16 #include "unicode/uchar.h"
     17 #include "unicode/unorm2.h"
     18 #include "unicode/ustring.h"
     19 #include "unicode/utext.h"
     20 #include "unicode/utypes.h"
     21 
     22 namespace mozilla::intl {
     23 
     24 /**
     25 * This component is a Mozilla-focused API for working with strings in
     26 * internationalization code.
     27 */
     28 class String final {
     29 public:
     30  String() = delete;
     31 
     32  /**
     33   * Return the locale-sensitive lower case string of the input.
     34   */
     35  template <typename B>
     36  static Result<Ok, ICUError> ToLocaleLowerCase(const char* aLocale,
     37                                                Span<const char16_t> aString,
     38                                                B& aBuffer) {
     39    if (!aBuffer.reserve(aString.size())) {
     40      return Err(ICUError::OutOfMemory);
     41    }
     42    return FillBufferWithICUCall(
     43        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
     44          return u_strToLower(target, length, aString.data(), aString.size(),
     45                              aLocale, status);
     46        });
     47  }
     48 
     49  /**
     50   * Return the locale-sensitive upper case string of the input.
     51   */
     52  template <typename B>
     53  static Result<Ok, ICUError> ToLocaleUpperCase(const char* aLocale,
     54                                                Span<const char16_t> aString,
     55                                                B& aBuffer) {
     56    if (!aBuffer.reserve(aString.size())) {
     57      return Err(ICUError::OutOfMemory);
     58    }
     59    return FillBufferWithICUCall(
     60        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
     61          return u_strToUpper(target, length, aString.data(), aString.size(),
     62                              aLocale, status);
     63        });
     64  }
     65 
  /**
   * Normalization form constants to describe which normalization algorithm
   * should be performed. Passed as the first argument of Normalize(), which
   * selects the matching ICU normalizer instance for each value.
   *
   * Also see:
   * - Unicode Standard, §2.12 Equivalent Sequences
   * - Unicode Standard, §3.11 Normalization Forms
   * - https://unicode.org/reports/tr15/
   */
  enum class NormalizationForm {
    /**
     * Normalization Form C. Normalize() uses unorm2_getNFCInstance() for it.
     */
    NFC,

    /**
     * Normalization Form D. Normalize() uses unorm2_getNFDInstance() for it.
     */
    NFD,

    /**
     * Normalization Form KC. Normalize() uses unorm2_getNFKCInstance() for it.
     */
    NFKC,

    /**
     * Normalization Form KD. Normalize() uses unorm2_getNFKDInstance() for it.
     */
    NFKD,
  };
     96 
  /**
   * Success value of Normalize(): `Yes` when the input was already in the
   * requested normalization form (the output buffer is left unchanged), `No`
   * when the normalized string was placed into the output buffer.
   */
  enum class AlreadyNormalized : bool { No, Yes };
     98 
     99  /**
    100   * Normalize the input string according to requested normalization form.
    101   *
    102   * Returns `AlreadyNormalized::Yes` when the string is already in normalized
    103   * form. The output buffer is unchanged in this case. Otherwise returns
    104   * `AlreadyNormalized::No` and places the normalized string into the output
    105   * buffer.
    106   */
    107  template <typename B>
    108  static Result<AlreadyNormalized, ICUError> Normalize(
    109      NormalizationForm aForm, Span<const char16_t> aString, B& aBuffer) {
    110    // The unorm2_getXXXInstance() methods return a shared instance which must
    111    // not be deleted.
    112    UErrorCode status = U_ZERO_ERROR;
    113    const UNormalizer2* normalizer;
    114    switch (aForm) {
    115      case NormalizationForm::NFC:
    116        normalizer = unorm2_getNFCInstance(&status);
    117        break;
    118      case NormalizationForm::NFD:
    119        normalizer = unorm2_getNFDInstance(&status);
    120        break;
    121      case NormalizationForm::NFKC:
    122        normalizer = unorm2_getNFKCInstance(&status);
    123        break;
    124      case NormalizationForm::NFKD:
    125        normalizer = unorm2_getNFKDInstance(&status);
    126        break;
    127    }
    128    if (U_FAILURE(status)) {
    129      return Err(ToICUError(status));
    130    }
    131 
    132    int32_t spanLengthInt = unorm2_spanQuickCheckYes(normalizer, aString.data(),
    133                                                     aString.size(), &status);
    134    if (U_FAILURE(status)) {
    135      return Err(ToICUError(status));
    136    }
    137 
    138    size_t spanLength = AssertedCast<size_t>(spanLengthInt);
    139    MOZ_ASSERT(spanLength <= aString.size());
    140 
    141    // Return if the input string is already normalized.
    142    if (spanLength == aString.size()) {
    143      return AlreadyNormalized::Yes;
    144    }
    145 
    146    if (!aBuffer.reserve(aString.size())) {
    147      return Err(ICUError::OutOfMemory);
    148    }
    149 
    150    // Copy the already normalized prefix.
    151    if (spanLength > 0) {
    152      PodCopy(aBuffer.data(), aString.data(), spanLength);
    153 
    154      aBuffer.written(spanLength);
    155    }
    156 
    157    MOZ_TRY(FillBufferWithICUCall(
    158        aBuffer, [&](UChar* target, int32_t length, UErrorCode* status) {
    159          Span<const char16_t> remaining = aString.From(spanLength);
    160          return unorm2_normalizeSecondAndAppend(normalizer, target, spanLength,
    161                                                 length, remaining.data(),
    162                                                 remaining.size(), status);
    163        }));
    164 
    165    return AlreadyNormalized::No;
    166  }
    167 
    168  /**
    169   * Return true if the code point has the binary property "Cased".
    170   */
    171  static bool IsCased(char32_t codePoint) {
    172    return u_hasBinaryProperty(static_cast<UChar32>(codePoint), UCHAR_CASED);
    173  }
    174 
    175  /**
    176   * Return true if the code point has the binary property "Case_Ignorable".
    177   */
    178  static bool IsCaseIgnorable(char32_t codePoint) {
    179    return u_hasBinaryProperty(static_cast<UChar32>(codePoint),
    180                               UCHAR_CASE_IGNORABLE);
    181  }
    182 
    183  /**
    184   * Return the NFC pairwise composition of the two input characters, if any;
    185   * returns 0 (which we know is not a composed char!) if none exists.
    186   */
    187  static char32_t ComposePairNFC(char32_t a, char32_t b) {
    188    // unorm2_getNFCInstance returns a static instance that does not have to be
    189    // released here. If it fails, we just return 0 (no composition) always.
    190    static UErrorCode status = U_ZERO_ERROR;
    191    static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
    192    if (U_FAILURE(status)) {
    193      return 0;
    194    }
    195    UChar32 ch = unorm2_composePair(normalizer, static_cast<UChar32>(a),
    196                                    static_cast<UChar32>(b));
    197    return ch < 0 ? 0 : static_cast<char32_t>(ch);
    198  }
    199 
    200  /**
    201   * Put the "raw" (single-level) canonical decomposition of the input char, if
    202   * any, into the provided buffer. Canonical decomps are never more than two
    203   * chars in length (although full normalization may result in longer output
    204   * due to recursion).
    205   * Returns the length of the decomposition (0 if none, else 1 or 2).
    206   */
    207  static int DecomposeRawNFD(char32_t ab, char32_t decomp[2]) {
    208    // unorm2_getNFCInstance returns a static instance that does not have to be
    209    // released here. If it fails, we just return 0 (no decomposition) always.
    210    // Although we are using it to query for a decomposition, the mode of the
    211    // Normalizer2 is irrelevant here, so we may as well use the same singleton
    212    // instance as ComposePairNFC.
    213    static UErrorCode status = U_ZERO_ERROR;
    214    static const UNormalizer2* normalizer = unorm2_getNFCInstance(&status);
    215    if (U_FAILURE(status)) {
    216      return 0;
    217    }
    218 
    219    // Canonical decompositions are never more than two Unicode characters,
    220    // or a maximum of 4 utf-16 code units.
    221    const unsigned MAX_DECOMP_LENGTH = 4;
    222    UErrorCode error = U_ZERO_ERROR;
    223    UChar decompUtf16[MAX_DECOMP_LENGTH];
    224    int32_t len =
    225        unorm2_getRawDecomposition(normalizer, static_cast<UChar32>(ab),
    226                                   decompUtf16, MAX_DECOMP_LENGTH, &error);
    227    if (U_FAILURE(error) || len < 0) {
    228      return 0;
    229    }
    230    UText text = UTEXT_INITIALIZER;
    231    utext_openUChars(&text, decompUtf16, len, &error);
    232    MOZ_ASSERT(U_SUCCESS(error));
    233    UChar32 ch = UTEXT_NEXT32(&text);
    234    len = 0;
    235    if (ch != U_SENTINEL) {
    236      decomp[0] = static_cast<char32_t>(ch);
    237      ++len;
    238      ch = UTEXT_NEXT32(&text);
    239      if (ch != U_SENTINEL) {
    240        decomp[1] = static_cast<char32_t>(ch);
    241        ++len;
    242      }
    243    }
    244    utext_close(&text);
    245    return len;
    246  }
    247 
  /**
   * Return the Unicode version, for example "13.0".
   *
   * Only declared in this header; the definition is not part of it.
   * NOTE(review): the lifetime of the characters behind the returned Span
   * and whether they are NUL-terminated depend on that definition - confirm
   * before storing the Span or passing its data to a C-string API.
   */
  static Span<const char> GetUnicodeVersion();
    252 };
    253 
    254 }  // namespace mozilla::intl
    255 
    256 #endif