tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

nsUnicharUtils.cpp (18039B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this
      4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      5 
      6 #include "nsUnicharUtils.h"
      7 #include "nsUnicodeProperties.h"
      8 #include "nsUTF8Utils.h"
      9 #include "mozilla/Likely.h"
     10 #include "mozilla/HashFunctions.h"
     11 #include "mozilla/intl/UnicodeProperties.h"
     12 #include "mozilla/StaticPrefs_layout.h"
     13 
     14 // We map x -> x, except for upper-case letters,
     15 // which we map to their lower-case equivalents.
     16 static const uint8_t gASCIIToLower[128] = {
     17    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
     18    0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
     19    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
     20    0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
     21    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
     22    0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
     23    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73,
     24    0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
     25    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
     26    0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
     27    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
     28 };
     29 
     30 // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
     31 // when they're called from within the case-insensitive comparators, so we
     32 // define inlined versions.
     33 static MOZ_ALWAYS_INLINE uint32_t ToLowerCase_inline(uint32_t aChar) {
     34  if (IS_ASCII(aChar)) {
     35    return gASCIIToLower[aChar];
     36  }
     37 
     38  return mozilla::intl::UnicodeProperties::ToLower(aChar);
     39 }
     40 
     41 static MOZ_ALWAYS_INLINE uint32_t
     42 ToLowerCaseASCII_inline(const uint32_t aChar) {
     43  if (IS_ASCII(aChar)) {
     44    return gASCIIToLower[aChar];
     45  }
     46 
     47  return aChar;
     48 }
     49 
     50 void ToLowerCase(nsAString& aString) {
     51  char16_t* buf = aString.BeginWriting();
     52  ToLowerCase(buf, buf, aString.Length());
     53 }
     54 
     55 void ToLowerCaseASCII(nsAString& aString) {
     56  char16_t* buf = aString.BeginWriting();
     57  ToLowerCaseASCII(buf, buf, aString.Length());
     58 }
     59 
     60 char ToLowerCaseASCII(char aChar) {
     61  if (aChar >= 'A' && aChar <= 'Z') {
     62    return aChar + 0x20;
     63  }
     64  return aChar;
     65 }
     66 
     67 char16_t ToLowerCaseASCII(char16_t aChar) {
     68  if (aChar >= 'A' && aChar <= 'Z') {
     69    return aChar + 0x20;
     70  }
     71  return aChar;
     72 }
     73 
     74 char32_t ToLowerCaseASCII(char32_t aChar) {
     75  if (aChar >= 'A' && aChar <= 'Z') {
     76    return aChar + 0x20;
     77  }
     78  return aChar;
     79 }
     80 
     81 char ToUpperCaseASCII(char aChar) {
     82  if (aChar >= 'a' && aChar <= 'z') {
     83    return aChar - 0x20;
     84  }
     85  return aChar;
     86 }
     87 
     88 char16_t ToUpperCaseASCII(char16_t aChar) {
     89  if (aChar >= 'a' && aChar <= 'z') {
     90    return aChar - 0x20;
     91  }
     92  return aChar;
     93 }
     94 
     95 char32_t ToUpperCaseASCII(char32_t aChar) {
     96  if (aChar >= 'a' && aChar <= 'z') {
     97    return aChar - 0x20;
     98  }
     99  return aChar;
    100 }
    101 
    102 void ToLowerCase(const nsAString& aSource, nsAString& aDest) {
    103  const char16_t* in = aSource.BeginReading();
    104  size_t len = aSource.Length();
    105 
    106  aDest.SetLength(len);
    107  char16_t* out = aDest.BeginWriting();
    108 
    109  ToLowerCase(in, out, len);
    110 }
    111 
    112 void ToLowerCaseASCII(const nsAString& aSource, nsAString& aDest) {
    113  const char16_t* in = aSource.BeginReading();
    114  size_t len = aSource.Length();
    115 
    116  aDest.SetLength(len);
    117  char16_t* out = aDest.BeginWriting();
    118 
    119  ToLowerCaseASCII(in, out, len);
    120 }
    121 
    122 uint32_t ToLowerCaseASCII(const uint32_t aChar) {
    123  return ToLowerCaseASCII_inline(aChar);
    124 }
    125 
    126 void ToUpperCase(nsAString& aString) {
    127  char16_t* buf = aString.BeginWriting();
    128  ToUpperCase(buf, buf, aString.Length());
    129 }
    130 
    131 void ToUpperCase(const nsAString& aSource, nsAString& aDest) {
    132  const char16_t* in = aSource.BeginReading();
    133  size_t len = aSource.Length();
    134 
    135  aDest.SetLength(len);
    136  char16_t* out = aDest.BeginWriting();
    137 
    138  ToUpperCase(in, out, len);
    139 }
    140 
    141 #ifdef MOZILLA_INTERNAL_API
    142 
    143 uint32_t ToFoldedCase(uint32_t aChar) {
    144  if (IS_ASCII(aChar)) return gASCIIToLower[aChar];
    145  return mozilla::unicode::GetFoldedcase(aChar);
    146 }
    147 
    148 void ToFoldedCase(nsAString& aString) {
    149  char16_t* buf = aString.BeginWriting();
    150  ToFoldedCase(buf, buf, aString.Length());
    151 }
    152 
    153 void ToFoldedCase(const char16_t* aIn, char16_t* aOut, size_t aLen) {
    154  for (uint32_t i = 0; i < aLen; i++) {
    155    uint32_t ch = aIn[i];
    156    if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) {
    157      ch = mozilla::unicode::GetFoldedcase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
    158      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
    159      aOut[i++] = H_SURROGATE(ch);
    160      aOut[i] = L_SURROGATE(ch);
    161      continue;
    162    }
    163    aOut[i] = ToFoldedCase(ch);
    164  }
    165 }
    166 
    167 uint32_t ToNaked(uint32_t aChar) {
    168  if (IS_ASCII(aChar)) {
    169    return aChar;
    170  }
    171  return mozilla::unicode::GetNaked(aChar);
    172 }
    173 
    174 void ToNaked(nsAString& aString) {
    175  uint32_t i = 0;
    176  while (i < aString.Length()) {
    177    uint32_t ch = aString[i];
    178    if (i < aString.Length() - 1 && NS_IS_SURROGATE_PAIR(ch, aString[i + 1])) {
    179      ch = SURROGATE_TO_UCS4(ch, aString[i + 1]);
    180      if (mozilla::unicode::IsCombiningDiacritic(ch)) {
    181        aString.Cut(i, 2);
    182      } else {
    183        ch = mozilla::unicode::GetNaked(ch);
    184        NS_ASSERTION(!IS_IN_BMP(ch), "stripping crossed BMP/SMP boundary!");
    185        aString.Replace(i++, 1, H_SURROGATE(ch));
    186        aString.Replace(i++, 1, L_SURROGATE(ch));
    187      }
    188      continue;
    189    }
    190    if (mozilla::unicode::IsCombiningDiacritic(ch)) {
    191      aString.Cut(i, 1);
    192    } else {
    193      aString.Replace(i++, 1, ToNaked(ch));
    194    }
    195  }
    196 }
    197 
    198 int32_t nsCaseInsensitiveStringComparator(const char16_t* lhs,
    199                                          const char16_t* rhs, size_t lLength,
    200                                          size_t rLength) {
    201  return (lLength == rLength)  ? CaseInsensitiveCompare(lhs, rhs, lLength)
    202         : (lLength > rLength) ? 1
    203                               : -1;
    204 }
    205 
    206 int32_t nsCaseInsensitiveUTF8StringComparator(const char* lhs, const char* rhs,
    207                                              size_t lLength, size_t rLength) {
    208  return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
    209 }
    210 
    211 int32_t nsASCIICaseInsensitiveStringComparator(const char16_t* lhs,
    212                                               const char16_t* rhs,
    213                                               size_t lLength, size_t rLength) {
    214  if (lLength != rLength) {
    215    if (lLength > rLength) return 1;
    216    return -1;
    217  }
    218 
    219  while (rLength) {
    220    // we don't care about surrogates here, because we're only
    221    // lowercasing the ASCII range
    222    char16_t l = *lhs++;
    223    char16_t r = *rhs++;
    224    if (l != r) {
    225      l = ToLowerCaseASCII_inline(l);
    226      r = ToLowerCaseASCII_inline(r);
    227 
    228      if (l > r)
    229        return 1;
    230      else if (r > l)
    231        return -1;
    232    }
    233    rLength--;
    234  }
    235 
    236  return 0;
    237 }
    238 
    239 #endif  // MOZILLA_INTERNAL_API
    240 
    241 uint32_t ToLowerCase(uint32_t aChar) { return ToLowerCase_inline(aChar); }
    242 
    243 void ToLowerCase(const char16_t* aIn, char16_t* aOut, size_t aLen) {
    244  for (size_t i = 0; i < aLen; i++) {
    245    uint32_t ch = aIn[i];
    246    if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) {
    247      ch = mozilla::intl::UnicodeProperties::ToLower(
    248          SURROGATE_TO_UCS4(ch, aIn[i + 1]));
    249      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
    250      aOut[i++] = H_SURROGATE(ch);
    251      aOut[i] = L_SURROGATE(ch);
    252      continue;
    253    }
    254    aOut[i] = ToLowerCase(ch);
    255  }
    256 }
    257 
    258 void ToLowerCaseASCII(const char16_t* aIn, char16_t* aOut, size_t aLen) {
    259  for (size_t i = 0; i < aLen; i++) {
    260    char16_t ch = aIn[i];
    261    aOut[i] = IS_ASCII_UPPER(ch) ? (ch + 0x20) : ch;
    262  }
    263 }
    264 
    265 uint32_t ToUpperCase(uint32_t aChar) {
    266  if (IS_ASCII(aChar)) {
    267    if (IS_ASCII_LOWER(aChar)) {
    268      return aChar - 0x20;
    269    }
    270    return aChar;
    271  }
    272 
    273  return mozilla::intl::UnicodeProperties::ToUpper(aChar);
    274 }
    275 
    276 void ToUpperCase(const char16_t* aIn, char16_t* aOut, size_t aLen) {
    277  for (size_t i = 0; i < aLen; i++) {
    278    uint32_t ch = aIn[i];
    279    if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) {
    280      ch = mozilla::intl::UnicodeProperties::ToUpper(
    281          SURROGATE_TO_UCS4(ch, aIn[i + 1]));
    282      NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
    283      aOut[i++] = H_SURROGATE(ch);
    284      aOut[i] = L_SURROGATE(ch);
    285      continue;
    286    }
    287    aOut[i] = ToUpperCase(ch);
    288  }
    289 }
    290 
    291 uint32_t ToTitleCase(uint32_t aChar) {
    292  if (IS_ASCII(aChar)) {
    293    return ToUpperCase(aChar);
    294  }
    295 
    296  return mozilla::unicode::GetTitlecaseForLower(aChar);
    297 }
    298 
    299 int32_t CaseInsensitiveCompare(const char16_t* a, const char16_t* b,
    300                               size_t len) {
    301  NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
    302 
    303  if (len) {
    304    do {
    305      uint32_t c1 = *a++;
    306      uint32_t c2 = *b++;
    307 
    308      // Unfortunately, we need to check for surrogates BEFORE we check
    309      // for equality, because we could have identical high surrogates
    310      // but non-identical characters, so we can't just skip them
    311 
    312      // If c1 isn't a surrogate, we don't bother to check c2;
    313      // in the case where it _is_ a surrogate, we're definitely going to get
    314      // a mismatch, and don't need to interpret and lowercase it
    315 
    316      if (len > 1 && NS_IS_SURROGATE_PAIR(c1, *a)) {
    317        c1 = SURROGATE_TO_UCS4(c1, *a++);
    318        if (NS_IS_SURROGATE_PAIR(c2, *b)) {
    319          c2 = SURROGATE_TO_UCS4(c2, *b++);
    320        }
    321        // If c2 wasn't a surrogate, decrementing len means we'd stop
    322        // short of the end of string b, but that doesn't actually matter
    323        // because we're going to find a mismatch and return early
    324        --len;
    325      }
    326 
    327      if (c1 != c2) {
    328        c1 = ToLowerCase_inline(c1);
    329        c2 = ToLowerCase_inline(c2);
    330        if (c1 != c2) {
    331          if (c1 < c2) {
    332            return -1;
    333          }
    334          return 1;
    335        }
    336      }
    337    } while (--len != 0);
    338  }
    339  return 0;
    340 }
    341 
    342 // Inlined definition of GetLowerUTF8Codepoint, which we use because we want
    343 // to be fast when called from the case-insensitive comparators.
    344 static MOZ_ALWAYS_INLINE uint32_t GetLowerUTF8Codepoint_inline(
    345    const char* aStr, const char* aEnd, const char** aNext) {
    346  // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
    347  // sign extend.
    348  const unsigned char* str = (unsigned char*)aStr;
    349 
    350  if (UTF8traits::isASCII(str[0])) {
    351    // It's ASCII; just convert to lower-case and return it.
    352    *aNext = aStr + 1;
    353    return gASCIIToLower[*str];
    354  }
    355  if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {
    356    // It's a two-byte sequence, so it looks like
    357    //  110XXXXX 10XXXXXX.
    358    // This is definitely in the BMP, so we can store straightaway into a
    359    // uint16_t.
    360 
    361    uint16_t c;
    362    c = (str[0] & 0x1F) << 6;
    363    c += (str[1] & 0x3F);
    364 
    365    // we don't go through ToLowerCase here, because we know this isn't
    366    // an ASCII character so the ASCII fast-path there is useless
    367    c = mozilla::intl::UnicodeProperties::ToLower(c);
    368 
    369    *aNext = aStr + 2;
    370    return c;
    371  }
    372  if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {
    373    // It's a three-byte sequence, so it looks like
    374    //  1110XXXX 10XXXXXX 10XXXXXX.
    375    // This will just barely fit into 16-bits, so store into a uint16_t.
    376 
    377    uint16_t c;
    378    c = (str[0] & 0x0F) << 12;
    379    c += (str[1] & 0x3F) << 6;
    380    c += (str[2] & 0x3F);
    381 
    382    c = mozilla::intl::UnicodeProperties::ToLower(c);
    383 
    384    *aNext = aStr + 3;
    385    return c;
    386  }
    387  if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {
    388    // It's a four-byte sequence, so it looks like
    389    //   11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
    390 
    391    uint32_t c;
    392    c = (str[0] & 0x07) << 18;
    393    c += (str[1] & 0x3F) << 12;
    394    c += (str[2] & 0x3F) << 6;
    395    c += (str[3] & 0x3F);
    396 
    397    c = mozilla::intl::UnicodeProperties::ToLower(c);
    398 
    399    *aNext = aStr + 4;
    400    return c;
    401  }
    402 
    403  // Hm, we don't understand this sequence.
    404  return -1;
    405 }
    406 
    407 uint32_t GetLowerUTF8Codepoint(const char* aStr, const char* aEnd,
    408                               const char** aNext) {
    409  return GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
    410 }
    411 
    412 int32_t CaseInsensitiveCompare(const char* aLeft, const char* aRight,
    413                               size_t aLeftBytes, size_t aRightBytes) {
    414  const char* leftEnd = aLeft + aLeftBytes;
    415  const char* rightEnd = aRight + aRightBytes;
    416 
    417  while (aLeft < leftEnd && aRight < rightEnd) {
    418    uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, leftEnd, &aLeft);
    419    if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) return -1;
    420 
    421    uint32_t rightChar =
    422        GetLowerUTF8Codepoint_inline(aRight, rightEnd, &aRight);
    423    if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) return -1;
    424 
    425    // Now leftChar and rightChar are lower-case, so we can compare them.
    426    if (leftChar != rightChar) {
    427      if (leftChar > rightChar) return 1;
    428      return -1;
    429    }
    430  }
    431 
    432  // Make sure that if one string is longer than the other we return the
    433  // correct result.
    434  if (aLeft < leftEnd) return 1;
    435  if (aRight < rightEnd) return -1;
    436 
    437  return 0;
    438 }
    439 
    440 static MOZ_ALWAYS_INLINE uint32_t
    441 GetLowerUTF8Codepoint_inline(const char* aStr, const char* aEnd,
    442                             const char** aNext, bool aMatchDiacritics) {
    443  uint32_t c;
    444  for (;;) {
    445    c = GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);
    446    if (aMatchDiacritics) {
    447      break;
    448    }
    449    if (!mozilla::unicode::IsCombiningDiacritic(c)) {
    450      break;
    451    }
    452    aStr = *aNext;
    453  }
    454  return c;
    455 }
    456 
    457 bool CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
    458                                   const char* aLeftEnd, const char* aRightEnd,
    459                                   const char** aLeftNext,
    460                                   const char** aRightNext, bool* aErr,
    461                                   bool aMatchDiacritics) {
    462  NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
    463  NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
    464  NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
    465  NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
    466  NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
    467 
    468  uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext,
    469                                                   aMatchDiacritics);
    470  if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
    471    *aErr = true;
    472    return false;
    473  }
    474 
    475  uint32_t rightChar = GetLowerUTF8Codepoint_inline(
    476      aRight, aRightEnd, aRightNext, aMatchDiacritics);
    477  if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
    478    *aErr = true;
    479    return false;
    480  }
    481 
    482  // Can't have an error past this point.
    483  *aErr = false;
    484 
    485  if (!aMatchDiacritics) {
    486    leftChar = ToNaked(leftChar);
    487    rightChar = ToNaked(rightChar);
    488  }
    489 
    490  return leftChar == rightChar;
    491 }
    492 
    493 namespace mozilla {
    494 
    495 uint32_t HashUTF8AsUTF16(const char* aUTF8, size_t aLength, bool* aErr) {
    496  uint32_t hash = 0;
    497  const char* s = aUTF8;
    498  const char* end = aUTF8 + aLength;
    499 
    500  *aErr = false;
    501 
    502  while (s < end) {
    503    uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
    504    if (*aErr) {
    505      return 0;
    506    }
    507 
    508    if (ucs4 < PLANE1_BASE) {
    509      hash = AddToHash(hash, ucs4);
    510    } else {
    511      hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
    512    }
    513  }
    514 
    515  return hash;
    516 }
    517 
    518 // The Korean Won currency sign has East Asian Width = HALFWIDTH, and
    519 // Script = COMMON (rather than HANGUL), but we don't want to treat it like
    520 // Chinese/Japanese half-width characters for segment break transformation,
    521 // so we exclude it individually in the two functions here.
    522 static constexpr uint32_t kWonCurrencySign = 0x20A9;
    523 
    524 bool IsSegmentBreakSkipChar(uint32_t u) {
    525  return intl::UnicodeProperties::IsEastAsianWidthFHWexcludingEmoji(u) &&
    526         intl::UnicodeProperties::GetScriptCode(u) != intl::Script::HANGUL &&
    527         u != kWonCurrencySign;
    528 }
    529 
    530 bool IsEastAsianPunctuation(uint32_t u) {
    531  // U+FF5E FULLWIDTH TILDE has General Category = Symbol (not Punctuation),
    532  // but is used similarly to U+301C WAVE DASH (which does have category
    533  // Punctuation). So we treat FULLWIDTH TILDE as punctuation here to give the
    534  // two characters consistent behavior.
    535  constexpr uint32_t kFullwidthTilde = 0xFF5E;
    536  // U+3000 IDEOGRAPHIC SPACE has General Category = Zs (not Punctuation),
    537  // but it conflicts with a JLReq rule that space added after
    538  // question or exclamation mark is stipulated to be full-width if line is
    539  // broken after full-width space following such a punctuation mark but
    540  // line break is replaced by a space. So we treat IDEOGRAPHIC SPACE as
    541  // punctuation here to allow line breaks after it while maintaining
    542  // compatibility with JLReq.
    543  constexpr uint32_t kIdeographicSpace = 0x3000;
    544  return intl::UnicodeProperties::IsEastAsianWidthFHW(u) &&
    545         ((intl::UnicodeProperties::IsPunctuation(u) &&
    546           u != kWonCurrencySign) ||
    547          u == kFullwidthTilde || u == kIdeographicSpace);
    548 }
    549 
    550 bool IsPunctuationForWordSelect(char16_t aCh) {
    551  const uint8_t cat = unicode::GetGeneralCategory(aCh);
    552  switch (cat) {
    553    case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */
    554      if (aCh == '_' && !StaticPrefs::layout_word_select_stop_at_underscore()) {
    555        return false;
    556      }
    557      [[fallthrough]];
    558    case HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION:    /* Pd */
    559    case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION:   /* Pe */
    560    case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION:   /* Pf */
    561    case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */
    562    case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION:   /* Po */
    563    case HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION:    /* Ps */
    564    case HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL:     /* Sc */
    565    // Deliberately omitted:
    566    // case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL:     /* Sk */
    567    case HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL:  /* Sm */
    568    case HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL: /* So */
    569      return true;
    570    default:
    571      return false;
    572  }
    573 }
    574 
    575 }  // namespace mozilla