tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

nsUnicodeProperties.h (7173B)


      1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim:set ts=4 sw=2 sts=2 et cindent: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #ifndef NS_UNICODEPROPERTIES_H
      8 #define NS_UNICODEPROPERTIES_H
      9 
     10 #include "mozilla/intl/UnicodeProperties.h"
     11 
     12 #include "mozilla/Span.h"
     13 #include "nsBidiUtils.h"
     14 #include "nsUGenCategory.h"
     15 #include "harfbuzz/hb.h"
     16 
     17 namespace mozilla {
     18 
     19 namespace unicode {
     20 
     21 extern const nsUGenCategory sDetailedToGeneralCategory[];
     22 
     // Vertical orientation classes of a character. These numeric values must
     // stay identical to the ones ICU assigns in UVerticalOrientation, because
     // raw ICU property values are cast straight to this enum.
     enum VerticalOrientation {
       VERTICAL_ORIENTATION_R = 0,
       VERTICAL_ORIENTATION_Tr = 1,
       VERTICAL_ORIENTATION_Tu = 2,
       VERTICAL_ORIENTATION_U = 3,
     };
     30 
     // Bidi paired-bracket classification of a character.
     // The numeric values MUST match those assigned by
     // genUnicodePropertyData.pl!
     enum PairedBracketType {
       PAIRED_BRACKET_TYPE_NONE = 0,
       PAIRED_BRACKET_TYPE_OPEN = 1,
       PAIRED_BRACKET_TYPE_CLOSE = 2,
     };
     37 
     // Identifier status of a character. These numeric values must stay
     // identical to the ones ICU assigns in UIdentifierStatus.
     enum IdentifierType {
       IDTYPE_RESTRICTED = 0,
       IDTYPE_ALLOWED = 1,
     };
     43 
     // Result type of GetEmojiPresentation().
     enum EmojiPresentation {
       TextOnly = 0,
       TextDefault = 1,
       EmojiDefault = 2,
     };
     45 
     // Variation selector that requests text presentation.
     const uint32_t kVariationSelector15 = 0xFE0E;
     // Variation selector that requests emoji presentation.
     const uint32_t kVariationSelector16 = 0xFE0F;

     // Returns true if aCh lies in the inclusive range
     // [kVariationSelector15, kVariationSelector16].
     static inline bool IsEmojiPresentationSelector(uint32_t aCh) {
       if (aCh < kVariationSelector15) {
         return false;
       }
       return aCh <= kVariationSelector16;
     }
     51 
     // First and last code points of the EMOJI MODIFIER FITZPATRICK TYPE-*
     // characters.
     const uint32_t kEmojiSkinToneFirst = 0x1f3fb;
     const uint32_t kEmojiSkinToneLast = 0x1f3ff;

     // Returns true if aCh lies in the inclusive range
     // [kEmojiSkinToneFirst, kEmojiSkinToneLast].
     static inline bool IsEmojiSkinToneModifier(uint32_t aCh) {
       const bool belowRange = aCh < kEmojiSkinToneFirst;
       const bool aboveRange = aCh > kEmojiSkinToneLast;
       return !(belowRange || aboveRange);
     }
     58 
     59 extern const hb_unicode_general_category_t sICUtoHBcategory[];
     60 
     61 // NOTE: This returns values matching harfbuzz HB_UNICODE_GENERAL_CATEGORY_*
     62 // constants, NOT the mozilla::intl::GeneralCategory enum.
     63 // For the GeneralCategory enum, use intl::UnicodeProperties::CharType itself.
     64 inline uint8_t GetGeneralCategory(uint32_t aCh) {
     65  return sICUtoHBcategory[unsigned(intl::UnicodeProperties::CharType(aCh))];
     66 }
     67 
     68 inline int8_t GetNumericValue(uint32_t aCh) {
     69  return intl::UnicodeProperties::GetNumericValue(aCh);
     70 }
     71 
     72 inline uint8_t GetLineBreakClass(uint32_t aCh) {
     73  return intl::UnicodeProperties::GetIntPropertyValue(
     74      aCh, intl::UnicodeProperties::IntProperty::LineBreak);
     75 }
     76 
     77 inline uint32_t GetScriptTagForCode(intl::Script aScriptCode) {
     78  const char* tag = intl::UnicodeProperties::GetScriptShortName(aScriptCode);
     79  if (tag) {
     80    return HB_TAG(tag[0], tag[1], tag[2], tag[3]);
     81  }
     82  // return UNKNOWN script tag (running with older ICU?)
     83  return HB_SCRIPT_UNKNOWN;
     84 }
     85 
     86 inline PairedBracketType GetPairedBracketType(uint32_t aCh) {
     87  return PairedBracketType(intl::UnicodeProperties::GetIntPropertyValue(
     88      aCh, intl::UnicodeProperties::IntProperty::BidiPairedBracketType));
     89 }
     90 
     91 inline uint32_t GetTitlecaseForLower(
     92    uint32_t aCh)  // maps LC to titlecase, UC unchanged
     93 {
     94  return intl::UnicodeProperties::IsLowercase(aCh)
     95             ? intl::UnicodeProperties::ToTitle(aCh)
     96             : aCh;
     97 }
     98 
     99 inline uint32_t GetTitlecaseForAll(
    100    uint32_t aCh)  // maps both UC and LC to titlecase
    101 {
    102  return intl::UnicodeProperties::ToTitle(aCh);
    103 }
    104 
    105 inline uint32_t GetFoldedcase(uint32_t aCh) {
    106  // Handle dotted capital I and dotless small i specially because we want to
    107  // use a combination of ordinary case-folding rules and Turkish case-folding
    108  // rules.
    109  if (aCh == 0x0130 || aCh == 0x0131) {
    110    return 'i';
    111  }
    112  return intl::UnicodeProperties::FoldCase(aCh);
    113 }
    114 
    115 inline bool IsDefaultIgnorable(uint32_t aCh) {
    116  return intl::UnicodeProperties::HasBinaryProperty(
    117      aCh, intl::UnicodeProperties::BinaryProperty::DefaultIgnorableCodePoint);
    118 }
    119 
    120 inline EmojiPresentation GetEmojiPresentation(uint32_t aCh) {
    121  if (!intl::UnicodeProperties::HasBinaryProperty(
    122          aCh, intl::UnicodeProperties::BinaryProperty::Emoji)) {
    123    return TextOnly;
    124  }
    125 
    126  if (intl::UnicodeProperties::HasBinaryProperty(
    127          aCh, intl::UnicodeProperties::BinaryProperty::EmojiPresentation)) {
    128    return EmojiDefault;
    129  }
    130  return TextDefault;
    131 }
    132 
    133 // returns the simplified Gen Category as defined in nsUGenCategory
    134 inline nsUGenCategory GetGenCategory(uint32_t aCh) {
    135  return sDetailedToGeneralCategory[GetGeneralCategory(aCh)];
    136 }
    137 
    138 inline VerticalOrientation GetVerticalOrientation(uint32_t aCh) {
    139  return VerticalOrientation(intl::UnicodeProperties::GetIntPropertyValue(
    140      aCh, intl::UnicodeProperties::IntProperty::VerticalOrientation));
    141 }
    142 
    143 inline IdentifierType GetIdentifierType(uint32_t aCh) {
    144  return IdentifierType(intl::UnicodeProperties::GetIntPropertyValue(
    145      aCh, intl::UnicodeProperties::IntProperty::IdentifierStatus));
    146 }
    147 
    148 uint32_t GetFullWidth(uint32_t aCh);
    149 // This is the inverse of GetFullWidth: it guarantees that, for every
    150 // codepoint c, GetFullWidthInverse(GetFullWidth(c)) == c.
    151 // Note that this function is not guaranteed to convert every wide-form
    152 // character to a possible narrow form.
    153 uint32_t GetFullWidthInverse(uint32_t aCh);
    154 
    155 bool IsClusterExtender(uint32_t aCh, uint8_t aCategory);
    156 
    157 inline bool IsClusterExtender(uint32_t aCh) {
    158  // There are no cluster-extender characters before the first combining-
    159  // character block at U+03xx, so we short-circuit here to avoid the cost
    160  // of calling GetGeneralCategory for Latin-1 letters etc.
    161  return aCh >= 0x0300 && IsClusterExtender(aCh, GetGeneralCategory(aCh));
    162 }
    163 
    164 bool IsClusterExtenderExcludingJoiners(uint32_t aCh, uint8_t aCategory);
    165 
    166 inline bool IsClusterExtenderExcludingJoiners(uint32_t aCh) {
    167  return aCh >= 0x0300 &&
    168         IsClusterExtenderExcludingJoiners(aCh, GetGeneralCategory(aCh));
    169 }
    170 
    171 // Count the number of grapheme clusters in the given string
    172 uint32_t CountGraphemeClusters(Span<const char16_t> aText);
    173 
    174 // Determine whether a character is a "combining diacritic" for the purpose
    175 // of diacritic-insensitive text search. Examples of such characters include
    176 // European accents and Hebrew niqqud, but not Hangul components or Thaana
    177 // vowels, even though Thaana vowels are combining nonspacing marks that could
    178 // be considered diacritics.
    179 // As an exception to strictly following Unicode properties, we exclude the
    180 // Japanese kana voicing marks
    181 //   3099;COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK;Mn;8;NSM
    182 //   309A;COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK;Mn;8;NSM
    183 // which users report should not be ignored (bug 1624244).
    184 // See is_combining_diacritic in base_chars.py and is_combining_diacritic.py.
    185 //
    186 // TODO: once ICU4X is integrated (replacing ICU4C) as the source of Unicode
    187 // properties, re-evaluate whether building the static bitset is worthwhile
    188 // or if we can revert to simply getting the combining class and comparing
    189 // to the values we care about at runtime.
    190 bool IsCombiningDiacritic(uint32_t aCh);
    191 
    192 // Remove diacritics from a character
    193 uint32_t GetNaked(uint32_t aCh);
    194 
    195 }  // end namespace unicode
    196 
    197 }  // end namespace mozilla
    198 
    199 #endif /* NS_UNICODEPROPERTIES_H */