tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

nsUnicodeProperties.cpp (8342B)


      1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim:set ts=4 sw=2 sts=2 et cindent: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "nsUnicodeProperties.h"
      8 #include "nsUnicodePropertyData.cpp"
      9 
     10 #include "mozilla/intl/Segmenter.h"
     11 
     12 #include "BaseChars.h"
     13 #include "IsCombiningDiacritic.h"
     14 
     15 #define UNICODE_BMP_LIMIT 0x10000
     16 
     17 namespace mozilla {
     18 
     19 namespace unicode {
     20 
     21 /*
     22 To store properties for a million Unicode codepoints compactly, we use
     23 a three-level array structure, with the Unicode values considered as
     24 three elements: Plane, Page, and Char.
     25 
     26 Space optimization happens because multiple Planes can refer to the same
     27 Page array, and multiple Pages can refer to the same Char array holding
     28 the actual values. In practice, most of the higher planes are empty and
     29 thus share the same data; and within the BMP, there are also many pages
     30 that repeat the same data for any given property.
     31 
     32 Plane is usually zero, so we skip a lookup in this case, and require
     33 that the Plane 0 pages are always the first set of entries in the Page
     34 array.
     35 
     36 The division of the remaining 16 bits into Page and Char fields is
     37 adjusted for each property (by experiment using the generation tool)
     38 to provide the most compact storage, depending on the distribution
     39 of values.
     40 */
     41 
     42 const nsUGenCategory sDetailedToGeneralCategory[] = {
     43    // clang-format off
     44  /*
     45   * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
     46   * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
     47   */
     48  /* CONTROL */             nsUGenCategory::kOther,
     49  /* FORMAT */              nsUGenCategory::kOther,
     50  /* UNASSIGNED */          nsUGenCategory::kOther,
     51  /* PRIVATE_USE */         nsUGenCategory::kOther,
     52  /* SURROGATE */           nsUGenCategory::kOther,
     53  /* LOWERCASE_LETTER */    nsUGenCategory::kLetter,
     54  /* MODIFIER_LETTER */     nsUGenCategory::kLetter,
     55  /* OTHER_LETTER */        nsUGenCategory::kLetter,
     56  /* TITLECASE_LETTER */    nsUGenCategory::kLetter,
     57  /* UPPERCASE_LETTER */    nsUGenCategory::kLetter,
     58  /* COMBINING_MARK */      nsUGenCategory::kMark,
     59  /* ENCLOSING_MARK */      nsUGenCategory::kMark,
     60  /* NON_SPACING_MARK */    nsUGenCategory::kMark,
     61  /* DECIMAL_NUMBER */      nsUGenCategory::kNumber,
     62  /* LETTER_NUMBER */       nsUGenCategory::kNumber,
     63  /* OTHER_NUMBER */        nsUGenCategory::kNumber,
     64  /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation,
     65  /* DASH_PUNCTUATION */    nsUGenCategory::kPunctuation,
     66  /* CLOSE_PUNCTUATION */   nsUGenCategory::kPunctuation,
     67  /* FINAL_PUNCTUATION */   nsUGenCategory::kPunctuation,
     68  /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
     69  /* OTHER_PUNCTUATION */   nsUGenCategory::kPunctuation,
     70  /* OPEN_PUNCTUATION */    nsUGenCategory::kPunctuation,
     71  /* CURRENCY_SYMBOL */     nsUGenCategory::kSymbol,
     72  /* MODIFIER_SYMBOL */     nsUGenCategory::kSymbol,
     73  /* MATH_SYMBOL */         nsUGenCategory::kSymbol,
     74  /* OTHER_SYMBOL */        nsUGenCategory::kSymbol,
     75  /* LINE_SEPARATOR */      nsUGenCategory::kSeparator,
     76  /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator,
     77  /* SPACE_SEPARATOR */     nsUGenCategory::kSeparator
     78    // clang-format on
     79 };
     80 
     81 const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
     82    // clang-format off
     83  HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0,
     84  HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1,
     85  HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2,
     86  HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3,
     87  HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4,
     88  HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5,
     89  HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6,
     90  HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7,
     91  HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8,
     92  HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9,
     93  HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10,
     94  HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11,
     95  HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12,
     96  HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13,
     97  HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
     98  HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15,
     99  HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16,
    100  HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17,
    101  HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18,
    102  HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19,
    103  HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20,
    104  HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21,
    105  HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
    106  HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23,
    107  HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24,
    108  HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25,
    109  HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26,
    110  HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27,
    111  HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
    112  HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29,
    113    // clang-format on
    114 };
    115 
    116 #define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_)             \
    117  uint32_t Get##prefix_(uint32_t aCh) {                         \
    118    if (aCh >= UNICODE_BMP_LIMIT) {                             \
    119      return aCh;                                               \
    120    }                                                           \
    121    auto page = s##prefix_##Pages[aCh >> k##prefix_##CharBits]; \
    122    auto index = aCh & ((1 << k##prefix_##CharBits) - 1);       \
    123    uint32_t v = s##prefix_##Values[page][index];               \
    124    return v ? v : aCh;                                         \
    125  }
    126 
    127 // full-width mappings only exist for BMP characters; all others are
    128 // returned unchanged
    129 DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth)
    130 DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)
    131 
    132 bool IsClusterExtender(uint32_t aCh, uint8_t aCategory) {
    133  return (
    134      (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
    135       aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
    136      (aCh >= 0x200c && aCh <= 0x200d) ||    // ZWJ, ZWNJ
    137      (aCh >= 0xff9e && aCh <= 0xff9f) ||    // katakana sound marks
    138      (aCh >= 0x1F3FB && aCh <= 0x1F3FF) ||  // fitzpatrick skin tone modifiers
    139      (aCh >= 0xe0020 && aCh <= 0xe007f));   // emoji (flag) tag characters
    140 }
    141 
    142 bool IsClusterExtenderExcludingJoiners(uint32_t aCh, uint8_t aCategory) {
    143  return (
    144      (aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
    145       aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
    146      (aCh >= 0xff9e && aCh <= 0xff9f) ||    // katakana sound marks
    147      (aCh >= 0x1F3FB && aCh <= 0x1F3FF) ||  // fitzpatrick skin tone modifiers
    148      (aCh >= 0xe0020 && aCh <= 0xe007f));   // emoji (flag) tag characters
    149 }
    150 
    151 uint32_t CountGraphemeClusters(Span<const char16_t> aText) {
    152  if (aText.IsEmpty()) {
    153    // Fast path for empty text.
    154    return 0;
    155  }
    156  intl::GraphemeClusterBreakIteratorUtf16 iter(aText);
    157  uint32_t result = 0;
    158  while (iter.Next()) {
    159    ++result;
    160  }
    161  return result;
    162 }
    163 
    164 uint32_t GetNaked(uint32_t aCh) {
    165  uint32_t index = aCh >> 8;
    166  if (index >= std::size(BASE_CHAR_MAPPING_BLOCK_INDEX)) {
    167    return aCh;
    168  }
    169  index = BASE_CHAR_MAPPING_BLOCK_INDEX[index];
    170  if (index == 0xff) {
    171    return aCh;
    172  }
    173  const BaseCharMappingBlock& block = BASE_CHAR_MAPPING_BLOCKS[index];
    174  uint8_t lo = aCh & 0xff;
    175  if (lo < block.mFirst || lo > block.mLast) {
    176    return aCh;
    177  }
    178  return (aCh & 0xffff0000) |
    179         BASE_CHAR_MAPPING_LIST[block.mMappingStartOffset + lo - block.mFirst];
    180 }
    181 
    182 bool IsCombiningDiacritic(uint32_t aCh) {
    183  return sCombiningDiacriticsSet->test(aCh);
    184 }
    185 
    186 }  // end namespace unicode
    187 
    188 }  // end namespace mozilla