[ tor-browser ].git.dasho

nsTextFrameUtils.cpp (16201B)
      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "nsTextFrameUtils.h"
      8 
      9 #include <algorithm>
     10 
     11 #include "mozilla/dom/CharacterDataBuffer.h"
     12 #include "mozilla/dom/Text.h"
     13 #include "nsBidiUtils.h"
     14 #include "nsCharTraits.h"
     15 #include "nsIContent.h"
     16 #include "nsStyleStruct.h"
     17 #include "nsUnicharUtils.h"
     18 #include "nsUnicodeProperties.h"
     19 
     20 using namespace mozilla;
     21 using namespace mozilla::dom;
     22 using namespace mozilla::unicode;
     23 
     24 // static
     25 bool nsTextFrameUtils::IsSpaceCombiningSequenceTail(const char16_t* aChars,
     26                                                    int32_t aLength) {
     27  return aLength > 0 &&
     28         (IsClusterExtenderExcludingJoiners(aChars[0]) ||
     29          (IsBidiControl(aChars[0]) &&
     30           IsSpaceCombiningSequenceTail(aChars + 1, aLength - 1)));
     31 }
     32 
     33 static bool IsDiscardable(char16_t ch, nsTextFrameUtils::Flags* aFlags) {
     34  // Unlike IS_DISCARDABLE, we don't discard \r. \r will be ignored by
     35  // gfxTextRun and discarding it would force us to copy text in many cases of
     36  // preformatted text containing \r\n.
     37  if (ch == CH_SHY) {
     38    *aFlags |= nsTextFrameUtils::Flags::HasShy;
     39    return true;
     40  }
     41  return IsBidiControl(ch);
     42 }
     43 
     44 static bool IsDiscardable(uint8_t ch, nsTextFrameUtils::Flags* aFlags) {
     45  if (ch == CH_SHY) {
     46    *aFlags |= nsTextFrameUtils::Flags::HasShy;
     47    return true;
     48  }
     49  return false;
     50 }
     51 
     // True only for the line feed character ('\n'), the one character this
     // file treats as a segment break. constexpr so that it can also be
     // evaluated at compile time.
     static constexpr bool IsSegmentBreak(char16_t aCh) { return aCh == '\n'; }
     53 
     // True for U+0020 (space) and U+0009 (tab) only. constexpr so that it can
     // also be evaluated at compile time.
     static constexpr bool IsSpaceOrTab(char16_t aCh) {
       return aCh == ' ' || aCh == '\t';
     }
     55 
     // True for the three characters this file handles as collapsible white
     // space: space, tab and line feed. The comparisons are spelled out here
     // so the function is usable in constant expressions on its own.
     static constexpr bool IsSpaceOrTabOrSegmentBreak(char16_t aCh) {
       return aCh == ' ' || aCh == '\t' || aCh == '\n';
     }
     59 
     60 template <typename CharT>
     61 /* static */
     62 bool nsTextFrameUtils::IsSkippableCharacterForTransformText(CharT aChar) {
     63  return aChar == ' ' || aChar == '\t' || aChar == '\n' || aChar == CH_SHY ||
     64         (aChar > 0xFF && IsBidiControl(aChar));
     65 }
     66 
     #ifdef DEBUG
     // Debug-only consistency check. Starting at original offset
     // aSkipCharsOffset in aSkipChars, walks to the end of the original text
     // and asserts that every character marked as skipped satisfies
     // IsSkippableCharacterForTransformText. aText is indexed relative to
     // aSkipCharsOffset.
     template <typename CharT>
     static void AssertSkippedExpectedChars(const CharT* aText,
                                            const gfxSkipChars& aSkipChars,
                                            int32_t aSkipCharsOffset) {
       gfxSkipCharsIterator iter(aSkipChars);
       for (iter.AdvanceOriginal(aSkipCharsOffset);
            iter.GetOriginalOffset() < iter.GetOriginalEnd();
            iter.AdvanceOriginal(1)) {
         CharT current = aText[iter.GetOriginalOffset() - aSkipCharsOffset];
         MOZ_ASSERT(
             !iter.IsOriginalCharSkipped() ||
                 nsTextFrameUtils::IsSkippableCharacterForTransformText(current),
             "skipped unexpected character; need to update "
             "IsSkippableCharacterForTransformText?");
       }
     }
     #endif
     84 
     85 template <class CharT>
     86 static CharT* TransformWhiteSpaces(
     87    const CharT* aText, uint32_t aLength, uint32_t aBegin, uint32_t aEnd,
     88    bool aHasSegmentBreak, bool& aInWhitespace, CharT* aOutput,
     89    nsTextFrameUtils::Flags& aFlags,
     90    nsTextFrameUtils::CompressionMode aCompression, gfxSkipChars* aSkipChars,
     91    bool aLangIsJapaneseOrChinese) {
     92  MOZ_ASSERT(aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE ||
     93                 aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE_NEWLINE,
     94             "whitespaces should be skippable!!");
     95  // Get the context preceding/following this white space range.
     96  // For 8-bit text (sizeof CharT == 1), the checks here should get optimized
     97  // out, and isSegmentBreakSkippable should be initialized to be 'false'.
     98  bool isSegmentBreakSkippable =
     99      sizeof(CharT) > 1 &&
    100      ((aBegin > 0 && IS_ZERO_WIDTH_SPACE(aText[aBegin - 1])) ||
    101       (aEnd < aLength && IS_ZERO_WIDTH_SPACE(aText[aEnd])));
    102  if (sizeof(CharT) > 1 && !isSegmentBreakSkippable && aBegin > 0 &&
    103      aEnd < aLength) {
    104    // Get the characters before and after the segment break, skipping past
    105    // any default-ignorable characters (e.g. variation selectors, various
    106    // invisible control chars, etc)
    107    uint32_t ucs4before, ucs4after;
    108    uint32_t pos = aBegin;
    109    do {
    110      if (pos > 1 && NS_IS_SURROGATE_PAIR(aText[pos - 2], aText[pos - 1])) {
    111        ucs4before = SURROGATE_TO_UCS4(aText[pos - 2], aText[pos - 1]);
    112        pos -= 2;
    113      } else {
    114        ucs4before = aText[pos - 1];
    115        pos -= 1;
    116      }
    117    } while (IsDefaultIgnorable(ucs4before) && pos > 0);
    118 
    119    pos = aEnd;
    120    do {
    121      if (pos + 1 < aLength &&
    122          NS_IS_SURROGATE_PAIR(aText[pos], aText[pos + 1])) {
    123        ucs4after = SURROGATE_TO_UCS4(aText[pos], aText[pos + 1]);
    124        pos += 2;
    125      } else {
    126        ucs4after = aText[pos];
    127        pos += 1;
    128      }
    129    } while (IsDefaultIgnorable(ucs4after) && pos < aLength);
    130 
    131    // Discard newlines between characters that have F, W, or H EastAsianWidth
    132    // property and neither side is Hangul.
    133    // For Japanese/Chinese, also discard if *either* character is a fullwidth/
    134    // wide punctuation character.
    135    isSegmentBreakSkippable =
    136        (IsSegmentBreakSkipChar(ucs4before) &&
    137         IsSegmentBreakSkipChar(ucs4after)) ||
    138        (aLangIsJapaneseOrChinese && (IsEastAsianPunctuation(ucs4before) ||
    139                                      IsEastAsianPunctuation(ucs4after)));
    140  }
    141 
    142  for (uint32_t i = aBegin; i < aEnd; ++i) {
    143    CharT ch = aText[i];
    144    bool keepChar = false;
    145    bool keepTransformedWhiteSpace = false;
    146    if (IsDiscardable(ch, &aFlags)) {
    147      aSkipChars->SkipChar();
    148      continue;
    149    }
    150    if (IsSpaceOrTab(ch)) {
    151      if (aHasSegmentBreak) {
    152        // If white-space is set to normal, nowrap, or pre-line, white space
    153        // characters are considered collapsible and all spaces and tabs
    154        // immediately preceding or following a segment break are removed.
    155        aSkipChars->SkipChar();
    156        continue;
    157      }
    158 
    159      if (aInWhitespace) {
    160        aSkipChars->SkipChar();
    161        continue;
    162      } else {
    163        keepTransformedWhiteSpace = true;
    164      }
    165    } else {
    166      // Apply Segment Break Transformation Rules (CSS Text 3 - 4.1.2) for
    167      // segment break characters.
    168      if (aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE ||
    169          // XXX: According to CSS Text 3, a lone CR should not always be
    170          //      kept, but still go through the Segment Break Transformation
    171          //      Rules. However, this is what current modern browser engines
    172          //      (webkit/blink/edge) do. So, once we can get some clarity
    173          //      from the specification issue, we should either remove the
    174          //      lone CR condition here, or leave it here with this comment
    175          //      being rephrased.
    176          //      Please see https://github.com/w3c/csswg-drafts/issues/855.
    177          ch == '\r') {
    178        keepChar = true;
    179      } else {
    180        // aCompression == COMPRESS_WHITESPACE_NEWLINE
    181 
    182        // Any collapsible segment break immediately following another
    183        // collapsible segment break is removed.  Then the remaining segment
    184        // break is either transformed into a space (U+0020) or removed
    185        // depending on the context before and after the break.
    186        if (isSegmentBreakSkippable || aInWhitespace) {
    187          aSkipChars->SkipChar();
    188          continue;
    189        }
    190        isSegmentBreakSkippable = true;
    191        keepTransformedWhiteSpace = true;
    192      }
    193    }
    194 
    195    if (keepChar) {
    196      *aOutput++ = ch;
    197      aSkipChars->KeepChar();
    198      aInWhitespace = IsSpaceOrTab(ch);
    199    } else if (keepTransformedWhiteSpace) {
    200      *aOutput++ = ' ';
    201      aSkipChars->KeepChar();
    202      aInWhitespace = true;
    203    } else {
    204      MOZ_ASSERT_UNREACHABLE("Should've skipped the character!!");
    205    }
    206  }
    207  return aOutput;
    208 }
    209 
    210 template <class CharT>
    211 CharT* nsTextFrameUtils::TransformText(
    212    const CharT* aText, uint32_t aLength, CharT* aOutput,
    213    CompressionMode aCompression, uint8_t* aIncomingFlags,
    214    gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage) {
    215  Flags flags = Flags();
    216 #ifdef DEBUG
    217  int32_t skipCharsOffset = aSkipChars->GetOriginalCharCount();
    218 #endif
    219 
    220  bool lastCharArabic = false;
    221  if (aCompression == COMPRESS_NONE ||
    222      aCompression == COMPRESS_NONE_TRANSFORM_TO_SPACE) {
    223    // Skip discardables.
    224    uint32_t i;
    225    for (i = 0; i < aLength; ++i) {
    226      CharT ch = aText[i];
    227      if (IsDiscardable(ch, &flags)) {
    228        aSkipChars->SkipChar();
    229      } else {
    230        aSkipChars->KeepChar();
    231        if (ch > ' ') {
    232          lastCharArabic = IS_ARABIC_CHAR(ch);
    233        } else if (aCompression == COMPRESS_NONE_TRANSFORM_TO_SPACE) {
    234          if (ch == '\t' || ch == '\n') {
    235            ch = ' ';
    236          }
    237        } else {
    238          // aCompression == COMPRESS_NONE
    239          if (ch == '\t') {
    240            flags |= Flags::HasTab;
    241          } else if (ch == '\n') {
    242            flags |= Flags::HasNewline;
    243          }
    244        }
    245        *aOutput++ = ch;
    246      }
    247    }
    248    if (lastCharArabic) {
    249      *aIncomingFlags |= INCOMING_ARABICCHAR;
    250    } else {
    251      *aIncomingFlags &= ~INCOMING_ARABICCHAR;
    252    }
    253    *aIncomingFlags &= ~INCOMING_WHITESPACE;
    254  } else {
    255    bool langIsJapaneseOrChinese = [=]() {
    256      if (!aLanguage || aLanguage->GetLength() < 2) {
    257        return false;
    258      }
    259      const char16_t* text = aLanguage->GetUTF16String();
    260      if ((ToLowerCaseASCII(text[0]) == char16_t('j') &&
    261           ToLowerCaseASCII(text[1]) == char16_t('a')) ||
    262          (ToLowerCaseASCII(text[0]) == char16_t('z') &&
    263           ToLowerCaseASCII(text[1]) == char16_t('h'))) {
    264        return aLanguage->GetLength() == 2 || text[2] == '-';
    265      }
    266      return false;
    267    }();
    268    bool inWhitespace = (*aIncomingFlags & INCOMING_WHITESPACE) != 0;
    269    uint32_t i;
    270    for (i = 0; i < aLength; ++i) {
    271      CharT ch = aText[i];
    272      // CSS Text 3 - 4.1. The White Space Processing Rules
    273      // White space processing in CSS affects only the document white space
    274      // characters: spaces (U+0020), tabs (U+0009), and segment breaks.
    275      // Since we need the context of segment breaks and their surrounding
    276      // white spaces to proceed the white space processing, a consecutive run
    277      // of spaces/tabs/segment breaks is collected in a first pass loop, then
    278      // we apply the collapsing and transformation rules to this run in a
    279      // second pass loop.
    280      if (IsSpaceOrTabOrSegmentBreak(ch)) {
    281        bool keepLastSpace = false;
    282        bool hasSegmentBreak = IsSegmentBreak(ch);
    283        uint32_t countTrailingDiscardables = 0;
    284        uint32_t j;
    285        for (j = i + 1; j < aLength && (IsSpaceOrTabOrSegmentBreak(aText[j]) ||
    286                                        IsDiscardable(aText[j], &flags));
    287             j++) {
    288          if (IsSegmentBreak(aText[j])) {
    289            hasSegmentBreak = true;
    290          }
    291        }
    292        // Exclude trailing discardables before checking space combining
    293        // sequence tail.
    294        for (; IsDiscardable(aText[j - 1], &flags); j--) {
    295          countTrailingDiscardables++;
    296        }
    297        // If the last white space is followed by a combining sequence tail,
    298        // exclude it from the range of TransformWhiteSpaces.
    299        if (sizeof(CharT) > 1 && aText[j - 1] == ' ' && j < aLength &&
    300            IsSpaceCombiningSequenceTail(&aText[j], aLength - j)) {
    301          keepLastSpace = true;
    302          j--;
    303        }
    304        if (j > i) {
    305          aOutput = TransformWhiteSpaces(
    306              aText, aLength, i, j, hasSegmentBreak, inWhitespace, aOutput,
    307              flags, aCompression, aSkipChars, langIsJapaneseOrChinese);
    308        }
    309        // We need to keep KeepChar()/SkipChar() in order, so process the
    310        // last white space first, then process the trailing discardables.
    311        if (keepLastSpace) {
    312          keepLastSpace = false;
    313          *aOutput++ = ' ';
    314          aSkipChars->KeepChar();
    315          lastCharArabic = false;
    316          j++;
    317        }
    318        for (; countTrailingDiscardables > 0; countTrailingDiscardables--) {
    319          aSkipChars->SkipChar();
    320          j++;
    321        }
    322        i = j - 1;
    323        continue;
    324      }
    325      // Process characters other than the document white space characters.
    326      if (IsDiscardable(ch, &flags)) {
    327        aSkipChars->SkipChar();
    328      } else {
    329        *aOutput++ = ch;
    330        aSkipChars->KeepChar();
    331      }
    332      lastCharArabic = IS_ARABIC_CHAR(ch);
    333      inWhitespace = false;
    334    }
    335 
    336    if (lastCharArabic) {
    337      *aIncomingFlags |= INCOMING_ARABICCHAR;
    338    } else {
    339      *aIncomingFlags &= ~INCOMING_ARABICCHAR;
    340    }
    341    if (inWhitespace) {
    342      *aIncomingFlags |= INCOMING_WHITESPACE;
    343    } else {
    344      *aIncomingFlags &= ~INCOMING_WHITESPACE;
    345    }
    346  }
    347 
    348  *aAnalysisFlags = flags;
    349 
    350 #ifdef DEBUG
    351  AssertSkippedExpectedChars(aText, *aSkipChars, skipCharsOffset);
    352 #endif
    353  return aOutput;
    354 }
    355 
    356 /*
    357 * NOTE: The TransformText and IsSkippableCharacterForTransformText template
    358 * functions are part of the public API of nsTextFrameUtils, while
    359 * their function bodies are not available in the header. They may stop working
    360 * (fail to resolve symbol in link time) once their callsites are moved to a
    361 * different translation unit (e.g. a different unified source file).
    362 * Explicit instantiating this function template with `uint8_t` and `char16_t`
    363 * could prevent us from the potential risk.
    364 */
    365 template uint8_t* nsTextFrameUtils::TransformText(
    366    const uint8_t* aText, uint32_t aLength, uint8_t* aOutput,
    367    CompressionMode aCompression, uint8_t* aIncomingFlags,
    368    gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage);
    369 template char16_t* nsTextFrameUtils::TransformText(
    370    const char16_t* aText, uint32_t aLength, char16_t* aOutput,
    371    CompressionMode aCompression, uint8_t* aIncomingFlags,
    372    gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage);
    373 template bool nsTextFrameUtils::IsSkippableCharacterForTransformText(
    374    uint8_t aChar);
    375 template bool nsTextFrameUtils::IsSkippableCharacterForTransformText(
    376    char16_t aChar);
    377 
    378 template <typename CharT>
    379 static uint32_t DoComputeApproximateLengthWithWhitespaceCompression(
    380    const CharT* aChars, uint32_t aLength, const nsStyleText* aStyleText) {
    381  // This is an approximation so we don't really need anything
    382  // too fancy here.
    383  uint32_t len;
    384  if (aStyleText->WhiteSpaceIsSignificant()) {
    385    return aLength;
    386  }
    387  bool prevWS = true;  // more important to ignore blocks with
    388                       // only whitespace than get inline boundaries
    389                       // exactly right
    390  len = 0;
    391  for (uint32_t i = 0; i < aLength; ++i) {
    392    CharT c = aChars[i];
    393    if (c == ' ' || c == '\n' || c == '\t' || c == '\r') {
    394      if (!prevWS) {
    395        ++len;
    396      }
    397      prevWS = true;
    398    } else {
    399      ++len;
    400      prevWS = false;
    401    }
    402  }
    403  return len;
    404 }
    405 
    406 uint32_t nsTextFrameUtils::ComputeApproximateLengthWithWhitespaceCompression(
    407    Text* aText, const nsStyleText* aStyleText) {
    408  const CharacterDataBuffer* characterDataBuffer = &aText->DataBuffer();
    409  if (characterDataBuffer->Is2b()) {
    410    return DoComputeApproximateLengthWithWhitespaceCompression(
    411        characterDataBuffer->Get2b(), characterDataBuffer->GetLength(),
    412        aStyleText);
    413  }
    414  return DoComputeApproximateLengthWithWhitespaceCompression(
    415      characterDataBuffer->Get1b(), characterDataBuffer->GetLength(),
    416      aStyleText);
    417 }
    418 
    419 uint32_t nsTextFrameUtils::ComputeApproximateLengthWithWhitespaceCompression(
    420    const nsAString& aString, const nsStyleText* aStyleText) {
    421  return DoComputeApproximateLengthWithWhitespaceCompression(
    422      aString.BeginReading(), aString.Length(), aStyleText);
    423 }
    424 
    425 bool nsSkipCharsRunIterator::NextRun() {
    426  do {
    427    if (mRunLength) {
    428      mIterator.AdvanceOriginal(mRunLength);
    429      NS_ASSERTION(mRunLength > 0,
    430                   "No characters in run (initial length too large?)");
    431      if (!mSkipped || mLengthIncludesSkipped) {
    432        mRemainingLength -= mRunLength;
    433      }
    434    }
    435    if (!mRemainingLength) {
    436      return false;
    437    }
    438    int32_t length;
    439    mSkipped = mIterator.IsOriginalCharSkipped(&length);
    440    mRunLength = std::min(length, mRemainingLength);
    441  } while (!mVisitSkipped && mSkipped);
    442 
    443  return true;
    444 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE