tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

UnicodeProperties.h (12676B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 #ifndef intl_components_UnicodeProperties_h_
      5 #define intl_components_UnicodeProperties_h_
      6 
      7 #include "mozilla/intl/BidiClass.h"
      8 #include "mozilla/intl/GeneralCategory.h"
      9 #include "mozilla/intl/ICU4CGlue.h"
     10 #include "mozilla/intl/UnicodeScriptCodes.h"
     11 #include "mozilla/Vector.h"
     12 
     13 #include "unicode/uchar.h"
     14 #include "unicode/uscript.h"
     15 
     16 namespace mozilla::intl {
     17 
     18 /**
     19 * This component is a Mozilla-focused API for working with text properties.
     20 */
     21 class UnicodeProperties final {
     22 public:
     23  /**
     24   * Return the BidiClass for the character.
     25   */
     26  static inline BidiClass GetBidiClass(uint32_t aCh) {
     27    return BidiClass(u_charDirection(aCh));
     28  }
     29 
     30  /**
     31   * Maps the specified character to a "mirror-image" character.
     32   */
     33  static inline uint32_t CharMirror(uint32_t aCh) { return u_charMirror(aCh); }
     34 
     35  /**
     36   * Return the general category value for the code point.
     37   */
     38  static inline GeneralCategory CharType(uint32_t aCh) {
     39    return GeneralCategory(u_charType(aCh));
     40  }
     41 
     42  /**
     43   * Determine whether the code point has the Bidi_Mirrored property.
     44   */
     45  static inline bool IsMirrored(uint32_t aCh) { return u_isMirrored(aCh); }
     46 
     47  /**
     48   * Returns the combining class of the code point as specified in
     49   * UnicodeData.txt.
     50   */
     51  static inline uint8_t GetCombiningClass(uint32_t aCh) {
     52    return u_getCombiningClass(aCh);
     53  }
     54 
     55  enum class IntProperty {
     56    BidiPairedBracketType,
     57    EastAsianWidth,
     58    HangulSyllableType,
     59    IdentifierStatus,
     60    LineBreak,
     61    NumericType,
     62    VerticalOrientation,
     63  };
     64 
     65  /**
     66   * Get the property value for an enumerated or integer Unicode property for a
     67   * code point.
     68   */
     69  static inline int32_t GetIntPropertyValue(uint32_t aCh, IntProperty aProp) {
     70    UProperty prop;
     71    switch (aProp) {
     72      case IntProperty::BidiPairedBracketType:
     73        prop = UCHAR_BIDI_PAIRED_BRACKET_TYPE;
     74        break;
     75      case IntProperty::EastAsianWidth:
     76        prop = UCHAR_EAST_ASIAN_WIDTH;
     77        break;
     78      case IntProperty::HangulSyllableType:
     79        prop = UCHAR_HANGUL_SYLLABLE_TYPE;
     80        break;
     81      case IntProperty::LineBreak:
     82        prop = UCHAR_LINE_BREAK;
     83        break;
     84      case IntProperty::NumericType:
     85        prop = UCHAR_NUMERIC_TYPE;
     86        break;
     87      case IntProperty::VerticalOrientation:
     88        prop = UCHAR_VERTICAL_ORIENTATION;
     89        break;
     90      case IntProperty::IdentifierStatus:
     91        prop = UCHAR_IDENTIFIER_STATUS;
     92        break;
     93    }
     94    return u_getIntPropertyValue(aCh, prop);
     95  }
     96 
     97  /**
     98   * Get the numeric value for a Unicode code point as defined in the
     99   * Unicode Character Database if the input is decimal or a digit,
    100   * otherwise, returns -1.
    101   */
    102  static inline int8_t GetNumericValue(uint32_t aCh) {
    103    UNumericType type =
    104        UNumericType(GetIntPropertyValue(aCh, IntProperty::NumericType));
    105    return type == U_NT_DECIMAL || type == U_NT_DIGIT
    106               ? int8_t(u_getNumericValue(aCh))
    107               : -1;
    108  }
    109 
    110  /**
    111   * Maps the specified character to its paired bracket character.
    112   */
    113  static inline uint32_t GetBidiPairedBracket(uint32_t aCh) {
    114    return u_getBidiPairedBracket(aCh);
    115  }
    116 
    117  /**
    118   * The given character is mapped to its uppercase equivalent according to
    119   * UnicodeData.txt; if the character has no uppercase equivalent, the
    120   * character itself is returned.
    121   */
    122  static inline uint32_t ToUpper(uint32_t aCh) { return u_toupper(aCh); }
    123 
    124  /**
    125   * The given character is mapped to its lowercase equivalent according to
    126   * UnicodeData.txt; if the character has no lowercase equivalent, the
    127   * character itself is returned.
    128   */
    129  static inline uint32_t ToLower(uint32_t aCh) { return u_tolower(aCh); }
    130 
    131  /**
    132   * Check if a code point has the Lowercase Unicode property.
    133   */
    134  static inline bool IsLowercase(uint32_t aCh) { return u_isULowercase(aCh); }
    135 
    136  /**
    137   * The given character is mapped to its titlecase equivalent according to
    138   * UnicodeData.txt; if the character has no titlecase equivalent, the
    139   * character itself is returned.
    140   */
    141  static inline uint32_t ToTitle(uint32_t aCh) { return u_totitle(aCh); }
    142 
    143  /**
    144   * The given character is mapped to its case folding equivalent according to
    145   * UnicodeData.txt and CaseFolding.txt;
    146   * if the character has no case folding equivalent, the character
    147   * itself is returned.
    148   */
    149  static inline uint32_t FoldCase(uint32_t aCh) {
    150    return u_foldCase(aCh, U_FOLD_CASE_DEFAULT);
    151  }
    152 
    153  enum class BinaryProperty {
    154    DefaultIgnorableCodePoint,
    155    Emoji,
    156    EmojiPresentation,
    157  };
    158 
    159  /**
    160   * Check a binary Unicode property for a code point.
    161   */
    162  static inline bool HasBinaryProperty(uint32_t aCh, BinaryProperty aProp) {
    163    UProperty prop;
    164    switch (aProp) {
    165      case BinaryProperty::DefaultIgnorableCodePoint:
    166        prop = UCHAR_DEFAULT_IGNORABLE_CODE_POINT;
    167        break;
    168      case BinaryProperty::Emoji:
    169        prop = UCHAR_EMOJI;
    170        break;
    171      case BinaryProperty::EmojiPresentation:
    172        prop = UCHAR_EMOJI_PRESENTATION;
    173        break;
    174    }
    175    return u_hasBinaryProperty(aCh, prop);
    176  }
    177 
    178  /**
    179   * Check if the width of aCh is full width, half width or wide.
    180   */
    181  static inline bool IsEastAsianWidthFHW(uint32_t aCh) {
    182    switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) {
    183      case U_EA_FULLWIDTH:
    184      case U_EA_HALFWIDTH:
    185      case U_EA_WIDE:
    186        return true;
    187      case U_EA_AMBIGUOUS:
    188      case U_EA_NARROW:
    189      case U_EA_NEUTRAL:
    190        return false;
    191    }
    192    return false;
    193  }
    194 
    195  /**
    196   * Check if the width of aCh is full width, half width or wide
    197   * excluding emoji.
    198   */
    199  static inline bool IsEastAsianWidthFHWexcludingEmoji(uint32_t aCh) {
    200    switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) {
    201      case U_EA_FULLWIDTH:
    202      case U_EA_HALFWIDTH:
    203        return true;
    204      case U_EA_WIDE:
    205        return HasBinaryProperty(aCh, BinaryProperty::Emoji) ? false : true;
    206      case U_EA_AMBIGUOUS:
    207      case U_EA_NARROW:
    208      case U_EA_NEUTRAL:
    209        return false;
    210    }
    211    return false;
    212  }
    213 
    214  /**
    215   * Check if the width of aCh is ambiguous, full width, or wide.
    216   */
    217  static inline bool IsEastAsianWidthAFW(uint32_t aCh) {
    218    switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) {
    219      case U_EA_AMBIGUOUS:
    220      case U_EA_FULLWIDTH:
    221      case U_EA_WIDE:
    222        return true;
    223      case U_EA_HALFWIDTH:
    224      case U_EA_NARROW:
    225      case U_EA_NEUTRAL:
    226        return false;
    227    }
    228    return false;
    229  }
    230 
    231  /**
    232   * Check if the width of aCh is full width, or wide.
    233   */
    234  static inline bool IsEastAsianWidthFW(uint32_t aCh) {
    235    switch (GetIntPropertyValue(aCh, IntProperty::EastAsianWidth)) {
    236      case U_EA_FULLWIDTH:
    237      case U_EA_WIDE:
    238        return true;
    239      case U_EA_AMBIGUOUS:
    240      case U_EA_HALFWIDTH:
    241      case U_EA_NARROW:
    242      case U_EA_NEUTRAL:
    243        return false;
    244    }
    245    return false;
    246  }
    247 
    248  /**
    249   * Check if the width of aCh is East Asian Fullwidth (F).
    250   */
    251  static inline bool IsEastAsianFullWidth(char32_t aCh) {
    252    return GetIntPropertyValue(aCh, IntProperty::EastAsianWidth) ==
    253           U_EA_FULLWIDTH;
    254  }
    255 
    256  /**
    257   * Check if the CharType of aCh is a letter type.
    258   */
    259  static inline bool IsLetter(char32_t aCh) {
    260    switch (CharType(aCh)) {
    261      case GeneralCategory::Uppercase_Letter:
    262      case GeneralCategory::Lowercase_Letter:
    263      case GeneralCategory::Titlecase_Letter:
    264      case GeneralCategory::Modifier_Letter:
    265      case GeneralCategory::Other_Letter:
    266        return true;
    267      default:
    268        return false;
    269    }
    270  }
    271 
    272  /**
    273   * Check if the CharType of aCh is a combining mark type.
    274   */
    275  static inline bool IsCombiningMark(char32_t aCh) {
    276    switch (CharType(aCh)) {
    277      case GeneralCategory::Nonspacing_Mark:
    278      case GeneralCategory::Spacing_Mark:
    279      case GeneralCategory::Enclosing_Mark:
    280        return true;
    281      default:
    282        return false;
    283    }
    284  }
    285 
    286  /**
    287   * Check if the CharType of aCh is a punctuation type.
    288   */
    289  static inline bool IsPunctuation(uint32_t aCh) {
    290    switch (CharType(aCh)) {
    291      case GeneralCategory::Dash_Punctuation:
    292      case GeneralCategory::Open_Punctuation:
    293      case GeneralCategory::Close_Punctuation:
    294      case GeneralCategory::Connector_Punctuation:
    295      case GeneralCategory::Other_Punctuation:
    296      case GeneralCategory::Initial_Punctuation:
    297      case GeneralCategory::Final_Punctuation:
    298        return true;
    299      default:
    300        return false;
    301    }
    302  }
    303 
    304  /**
    305   * Check if the CharType of aCh is math or other symbol.
    306   */
    307  static inline bool IsMathOrMusicSymbol(uint32_t aCh) {
    308    // Keep this function in sync with is_math_symbol in base_chars.py.
    309    return CharType(aCh) == GeneralCategory::Math_Symbol ||
    310           CharType(aCh) == GeneralCategory::Other_Symbol;
    311  }
    312 
    313  static inline Script GetScriptCode(uint32_t aCh) {
    314    // We can safely ignore the error code here because uscript_getScript
    315    // returns USCRIPT_INVALID_CODE in the event of an error.
    316    UErrorCode err = U_ZERO_ERROR;
    317    return Script(uscript_getScript(aCh, &err));
    318  }
    319 
    320  static inline bool HasScript(uint32_t aCh, Script aScript) {
    321    return uscript_hasScript(aCh, UScriptCode(aScript));
    322  }
    323 
    324  static inline const char* GetScriptShortName(Script aScript) {
    325    return uscript_getShortName(UScriptCode(aScript));
    326  }
    327 
    328  static inline int32_t GetMaxNumberOfScripts() {
    329    return u_getIntPropertyMaxValue(UCHAR_SCRIPT);
    330  }
    331 
    332  // Return true if aChar belongs to a SEAsian script that is written without
    333  // word spaces, so we need to use the "complex breaker" to find possible word
    334  // boundaries. (https://en.wikipedia.org/wiki/Scriptio_continua)
    335  static bool IsScriptioContinua(char16_t aChar) {
    336    Script sc = GetScriptCode(aChar);
    337    return sc == Script::THAI || sc == Script::MYANMAR || sc == Script::KHMER ||
    338           sc == Script::JAVANESE || sc == Script::BALINESE ||
    339           sc == Script::SUNDANESE || sc == Script::LAO;
    340  }
    341 
    342  // Return true if aChar belongs to a cursive script for which inter-character
    343  // justification should be disabled.
    344  static bool IsCursiveScript(char32_t aChar) {
    345    Script sc = GetScriptCode(aChar);
    346    return sc == Script::ARABIC || sc == Script::SYRIAC || sc == Script::NKO ||
    347           sc == Script::MANDAIC || sc == Script::MONGOLIAN ||
    348           sc == Script::PHAGS_PA || sc == Script::HANIFI_ROHINGYA;
    349  }
    350 
    351  // The code point which has the most script extensions is 0x0965, which has 21
    352  // script extensions, so choose the vector size as 32 to prevent heap
    353  // allocation.
    354  static constexpr size_t kMaxScripts = 32;
    355 
    356  using ScriptExtensionVector = Vector<Script, kMaxScripts>;
    357 
    358  /**
    359   * Get the script extensions for the given code point, and write the script
    360   * extensions to aExtensions vector. If the code point has script extensions,
    361   * the script code (Script::COMMON or Script::INHERITED) will be excluded.
    362   *
    363   * If the code point doesn't have any script extension, then its script code
    364   * will be written to aExtensions vector.
    365   *
    366   * If the code point is invalid, Script::UNKNOWN will be written to
    367   * aExtensions vector.
    368   *
    369   * Note: aExtensions will be cleared after calling this method regardless of
    370   * failure.
    371   *
    372   * See [1] for the script code of the code point, [2] for the script
    373   * extensions.
    374   *
    375   * https://www.unicode.org/Public/UNIDATA/Scripts.txt
    376   * https://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt
    377   */
    378  static ICUResult GetExtensions(char32_t aCodePoint,
    379                                 ScriptExtensionVector& aExtensions) {
    380    // Clear the vector first.
    381    aExtensions.clear();
    382 
    383    // We cannot pass aExtensions to uscript_getScriptExtension as USCriptCode
    384    // takes 4 bytes, so create a local UScriptCode array to get the extensions.
    385    UScriptCode ext[kMaxScripts];
    386    UErrorCode status = U_ZERO_ERROR;
    387    int32_t len = uscript_getScriptExtensions(static_cast<UChar32>(aCodePoint),
    388                                              ext, kMaxScripts, &status);
    389    if (U_FAILURE(status)) {
    390      // kMaxScripts should be large enough to hold the maximun number of script
    391      // extensions.
    392      MOZ_DIAGNOSTIC_ASSERT(status != U_BUFFER_OVERFLOW_ERROR);
    393      return Err(ToICUError(status));
    394    }
    395 
    396    if (!aExtensions.reserve(len)) {
    397      return Err(ICUError::OutOfMemory);
    398    }
    399 
    400    for (int32_t i = 0; i < len; i++) {
    401      aExtensions.infallibleAppend(Script(ext[i]));
    402    }
    403 
    404    return Ok();
    405  }
    406 };
    407 
    408 }  // namespace mozilla::intl
    409 
    410 #endif