nsTextFrameUtils.cpp (16201B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "nsTextFrameUtils.h" 8 9 #include <algorithm> 10 11 #include "mozilla/dom/CharacterDataBuffer.h" 12 #include "mozilla/dom/Text.h" 13 #include "nsBidiUtils.h" 14 #include "nsCharTraits.h" 15 #include "nsIContent.h" 16 #include "nsStyleStruct.h" 17 #include "nsUnicharUtils.h" 18 #include "nsUnicodeProperties.h" 19 20 using namespace mozilla; 21 using namespace mozilla::dom; 22 using namespace mozilla::unicode; 23 24 // static 25 bool nsTextFrameUtils::IsSpaceCombiningSequenceTail(const char16_t* aChars, 26 int32_t aLength) { 27 return aLength > 0 && 28 (IsClusterExtenderExcludingJoiners(aChars[0]) || 29 (IsBidiControl(aChars[0]) && 30 IsSpaceCombiningSequenceTail(aChars + 1, aLength - 1))); 31 } 32 33 static bool IsDiscardable(char16_t ch, nsTextFrameUtils::Flags* aFlags) { 34 // Unlike IS_DISCARDABLE, we don't discard \r. \r will be ignored by 35 // gfxTextRun and discarding it would force us to copy text in many cases of 36 // preformatted text containing \r\n. 37 if (ch == CH_SHY) { 38 *aFlags |= nsTextFrameUtils::Flags::HasShy; 39 return true; 40 } 41 return IsBidiControl(ch); 42 } 43 44 static bool IsDiscardable(uint8_t ch, nsTextFrameUtils::Flags* aFlags) { 45 if (ch == CH_SHY) { 46 *aFlags |= nsTextFrameUtils::Flags::HasShy; 47 return true; 48 } 49 return false; 50 } 51 52 static bool IsSegmentBreak(char16_t aCh) { return aCh == '\n'; } 53 54 static bool IsSpaceOrTab(char16_t aCh) { return aCh == ' ' || aCh == '\t'; } 55 56 static bool IsSpaceOrTabOrSegmentBreak(char16_t aCh) { 57 return IsSpaceOrTab(aCh) || IsSegmentBreak(aCh); 58 } 59 60 template <typename CharT> 61 /* static */ 62 bool nsTextFrameUtils::IsSkippableCharacterForTransformText(CharT aChar) { 63 return aChar == ' ' || aChar == '\t' || aChar == '\n' || aChar == CH_SHY || 64 (aChar > 0xFF && IsBidiControl(aChar)); 65 } 66 67 #ifdef DEBUG 68 template <typename CharT> 69 static void AssertSkippedExpectedChars(const CharT* aText, 70 const gfxSkipChars& aSkipChars, 71 int32_t aSkipCharsOffset) { 72 gfxSkipCharsIterator it(aSkipChars); 73 it.AdvanceOriginal(aSkipCharsOffset); 74 while (it.GetOriginalOffset() < it.GetOriginalEnd()) { 75 CharT ch = aText[it.GetOriginalOffset() - aSkipCharsOffset]; 76 MOZ_ASSERT(!it.IsOriginalCharSkipped() || 77 nsTextFrameUtils::IsSkippableCharacterForTransformText(ch), 78 "skipped unexpected character; need to update " 79 "IsSkippableCharacterForTransformText?"); 80 it.AdvanceOriginal(1); 81 } 82 } 83 #endif 84 85 template <class CharT> 86 static CharT* TransformWhiteSpaces( 87 const CharT* aText, uint32_t aLength, uint32_t aBegin, uint32_t aEnd, 88 bool aHasSegmentBreak, bool& aInWhitespace, CharT* aOutput, 89 nsTextFrameUtils::Flags& aFlags, 90 nsTextFrameUtils::CompressionMode aCompression, gfxSkipChars* aSkipChars, 91 bool aLangIsJapaneseOrChinese) { 92 MOZ_ASSERT(aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE || 93 aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE_NEWLINE, 94 "whitespaces should be skippable!!"); 95 // Get the context preceding/following this white space range. 96 // For 8-bit text (sizeof CharT == 1), the checks here should get optimized 97 // out, and isSegmentBreakSkippable should be initialized to be 'false'. 98 bool isSegmentBreakSkippable = 99 sizeof(CharT) > 1 && 100 ((aBegin > 0 && IS_ZERO_WIDTH_SPACE(aText[aBegin - 1])) || 101 (aEnd < aLength && IS_ZERO_WIDTH_SPACE(aText[aEnd]))); 102 if (sizeof(CharT) > 1 && !isSegmentBreakSkippable && aBegin > 0 && 103 aEnd < aLength) { 104 // Get the characters before and after the segment break, skipping past 105 // any default-ignorable characters (e.g. variation selectors, various 106 // invisible control chars, etc) 107 uint32_t ucs4before, ucs4after; 108 uint32_t pos = aBegin; 109 do { 110 if (pos > 1 && NS_IS_SURROGATE_PAIR(aText[pos - 2], aText[pos - 1])) { 111 ucs4before = SURROGATE_TO_UCS4(aText[pos - 2], aText[pos - 1]); 112 pos -= 2; 113 } else { 114 ucs4before = aText[pos - 1]; 115 pos -= 1; 116 } 117 } while (IsDefaultIgnorable(ucs4before) && pos > 0); 118 119 pos = aEnd; 120 do { 121 if (pos + 1 < aLength && 122 NS_IS_SURROGATE_PAIR(aText[pos], aText[pos + 1])) { 123 ucs4after = SURROGATE_TO_UCS4(aText[pos], aText[pos + 1]); 124 pos += 2; 125 } else { 126 ucs4after = aText[pos]; 127 pos += 1; 128 } 129 } while (IsDefaultIgnorable(ucs4after) && pos < aLength); 130 131 // Discard newlines between characters that have F, W, or H EastAsianWidth 132 // property and neither side is Hangul. 133 // For Japanese/Chinese, also discard if *either* character is a fullwidth/ 134 // wide punctuation character. 135 isSegmentBreakSkippable = 136 (IsSegmentBreakSkipChar(ucs4before) && 137 IsSegmentBreakSkipChar(ucs4after)) || 138 (aLangIsJapaneseOrChinese && (IsEastAsianPunctuation(ucs4before) || 139 IsEastAsianPunctuation(ucs4after))); 140 } 141 142 for (uint32_t i = aBegin; i < aEnd; ++i) { 143 CharT ch = aText[i]; 144 bool keepChar = false; 145 bool keepTransformedWhiteSpace = false; 146 if (IsDiscardable(ch, &aFlags)) { 147 aSkipChars->SkipChar(); 148 continue; 149 } 150 if (IsSpaceOrTab(ch)) { 151 if (aHasSegmentBreak) { 152 // If white-space is set to normal, nowrap, or pre-line, white space 153 // characters are considered collapsible and all spaces and tabs 154 // immediately preceding or following a segment break are removed. 155 aSkipChars->SkipChar(); 156 continue; 157 } 158 159 if (aInWhitespace) { 160 aSkipChars->SkipChar(); 161 continue; 162 } else { 163 keepTransformedWhiteSpace = true; 164 } 165 } else { 166 // Apply Segment Break Transformation Rules (CSS Text 3 - 4.1.2) for 167 // segment break characters. 168 if (aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE || 169 // XXX: According to CSS Text 3, a lone CR should not always be 170 // kept, but still go through the Segment Break Transformation 171 // Rules. However, this is what current modern browser engines 172 // (webkit/blink/edge) do. So, once we can get some clarity 173 // from the specification issue, we should either remove the 174 // lone CR condition here, or leave it here with this comment 175 // being rephrased. 176 // Please see https://github.com/w3c/csswg-drafts/issues/855. 177 ch == '\r') { 178 keepChar = true; 179 } else { 180 // aCompression == COMPRESS_WHITESPACE_NEWLINE 181 182 // Any collapsible segment break immediately following another 183 // collapsible segment break is removed. Then the remaining segment 184 // break is either transformed into a space (U+0020) or removed 185 // depending on the context before and after the break. 186 if (isSegmentBreakSkippable || aInWhitespace) { 187 aSkipChars->SkipChar(); 188 continue; 189 } 190 isSegmentBreakSkippable = true; 191 keepTransformedWhiteSpace = true; 192 } 193 } 194 195 if (keepChar) { 196 *aOutput++ = ch; 197 aSkipChars->KeepChar(); 198 aInWhitespace = IsSpaceOrTab(ch); 199 } else if (keepTransformedWhiteSpace) { 200 *aOutput++ = ' '; 201 aSkipChars->KeepChar(); 202 aInWhitespace = true; 203 } else { 204 MOZ_ASSERT_UNREACHABLE("Should've skipped the character!!"); 205 } 206 } 207 return aOutput; 208 } 209 210 template <class CharT> 211 CharT* nsTextFrameUtils::TransformText( 212 const CharT* aText, uint32_t aLength, CharT* aOutput, 213 CompressionMode aCompression, uint8_t* aIncomingFlags, 214 gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage) { 215 Flags flags = Flags(); 216 #ifdef DEBUG 217 int32_t skipCharsOffset = aSkipChars->GetOriginalCharCount(); 218 #endif 219 220 bool lastCharArabic = false; 221 if (aCompression == COMPRESS_NONE || 222 aCompression == COMPRESS_NONE_TRANSFORM_TO_SPACE) { 223 // Skip discardables. 224 uint32_t i; 225 for (i = 0; i < aLength; ++i) { 226 CharT ch = aText[i]; 227 if (IsDiscardable(ch, &flags)) { 228 aSkipChars->SkipChar(); 229 } else { 230 aSkipChars->KeepChar(); 231 if (ch > ' ') { 232 lastCharArabic = IS_ARABIC_CHAR(ch); 233 } else if (aCompression == COMPRESS_NONE_TRANSFORM_TO_SPACE) { 234 if (ch == '\t' || ch == '\n') { 235 ch = ' '; 236 } 237 } else { 238 // aCompression == COMPRESS_NONE 239 if (ch == '\t') { 240 flags |= Flags::HasTab; 241 } else if (ch == '\n') { 242 flags |= Flags::HasNewline; 243 } 244 } 245 *aOutput++ = ch; 246 } 247 } 248 if (lastCharArabic) { 249 *aIncomingFlags |= INCOMING_ARABICCHAR; 250 } else { 251 *aIncomingFlags &= ~INCOMING_ARABICCHAR; 252 } 253 *aIncomingFlags &= ~INCOMING_WHITESPACE; 254 } else { 255 bool langIsJapaneseOrChinese = [=]() { 256 if (!aLanguage || aLanguage->GetLength() < 2) { 257 return false; 258 } 259 const char16_t* text = aLanguage->GetUTF16String(); 260 if ((ToLowerCaseASCII(text[0]) == char16_t('j') && 261 ToLowerCaseASCII(text[1]) == char16_t('a')) || 262 (ToLowerCaseASCII(text[0]) == char16_t('z') && 263 ToLowerCaseASCII(text[1]) == char16_t('h'))) { 264 return aLanguage->GetLength() == 2 || text[2] == '-'; 265 } 266 return false; 267 }(); 268 bool inWhitespace = (*aIncomingFlags & INCOMING_WHITESPACE) != 0; 269 uint32_t i; 270 for (i = 0; i < aLength; ++i) { 271 CharT ch = aText[i]; 272 // CSS Text 3 - 4.1. The White Space Processing Rules 273 // White space processing in CSS affects only the document white space 274 // characters: spaces (U+0020), tabs (U+0009), and segment breaks. 275 // Since we need the context of segment breaks and their surrounding 276 // white spaces to proceed the white space processing, a consecutive run 277 // of spaces/tabs/segment breaks is collected in a first pass loop, then 278 // we apply the collapsing and transformation rules to this run in a 279 // second pass loop. 280 if (IsSpaceOrTabOrSegmentBreak(ch)) { 281 bool keepLastSpace = false; 282 bool hasSegmentBreak = IsSegmentBreak(ch); 283 uint32_t countTrailingDiscardables = 0; 284 uint32_t j; 285 for (j = i + 1; j < aLength && (IsSpaceOrTabOrSegmentBreak(aText[j]) || 286 IsDiscardable(aText[j], &flags)); 287 j++) { 288 if (IsSegmentBreak(aText[j])) { 289 hasSegmentBreak = true; 290 } 291 } 292 // Exclude trailing discardables before checking space combining 293 // sequence tail. 294 for (; IsDiscardable(aText[j - 1], &flags); j--) { 295 countTrailingDiscardables++; 296 } 297 // If the last white space is followed by a combining sequence tail, 298 // exclude it from the range of TransformWhiteSpaces. 299 if (sizeof(CharT) > 1 && aText[j - 1] == ' ' && j < aLength && 300 IsSpaceCombiningSequenceTail(&aText[j], aLength - j)) { 301 keepLastSpace = true; 302 j--; 303 } 304 if (j > i) { 305 aOutput = TransformWhiteSpaces( 306 aText, aLength, i, j, hasSegmentBreak, inWhitespace, aOutput, 307 flags, aCompression, aSkipChars, langIsJapaneseOrChinese); 308 } 309 // We need to keep KeepChar()/SkipChar() in order, so process the 310 // last white space first, then process the trailing discardables. 311 if (keepLastSpace) { 312 keepLastSpace = false; 313 *aOutput++ = ' '; 314 aSkipChars->KeepChar(); 315 lastCharArabic = false; 316 j++; 317 } 318 for (; countTrailingDiscardables > 0; countTrailingDiscardables--) { 319 aSkipChars->SkipChar(); 320 j++; 321 } 322 i = j - 1; 323 continue; 324 } 325 // Process characters other than the document white space characters. 326 if (IsDiscardable(ch, &flags)) { 327 aSkipChars->SkipChar(); 328 } else { 329 *aOutput++ = ch; 330 aSkipChars->KeepChar(); 331 } 332 lastCharArabic = IS_ARABIC_CHAR(ch); 333 inWhitespace = false; 334 } 335 336 if (lastCharArabic) { 337 *aIncomingFlags |= INCOMING_ARABICCHAR; 338 } else { 339 *aIncomingFlags &= ~INCOMING_ARABICCHAR; 340 } 341 if (inWhitespace) { 342 *aIncomingFlags |= INCOMING_WHITESPACE; 343 } else { 344 *aIncomingFlags &= ~INCOMING_WHITESPACE; 345 } 346 } 347 348 *aAnalysisFlags = flags; 349 350 #ifdef DEBUG 351 AssertSkippedExpectedChars(aText, *aSkipChars, skipCharsOffset); 352 #endif 353 return aOutput; 354 } 355 356 /* 357 * NOTE: The TransformText and IsSkippableCharacterForTransformText template 358 * functions are part of the public API of nsTextFrameUtils, while 359 * their function bodies are not available in the header. They may stop working 360 * (fail to resolve symbol in link time) once their callsites are moved to a 361 * different translation unit (e.g. a different unified source file). 362 * Explicit instantiating this function template with `uint8_t` and `char16_t` 363 * could prevent us from the potential risk. 364 */ 365 template uint8_t* nsTextFrameUtils::TransformText( 366 const uint8_t* aText, uint32_t aLength, uint8_t* aOutput, 367 CompressionMode aCompression, uint8_t* aIncomingFlags, 368 gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage); 369 template char16_t* nsTextFrameUtils::TransformText( 370 const char16_t* aText, uint32_t aLength, char16_t* aOutput, 371 CompressionMode aCompression, uint8_t* aIncomingFlags, 372 gfxSkipChars* aSkipChars, Flags* aAnalysisFlags, const nsAtom* aLanguage); 373 template bool nsTextFrameUtils::IsSkippableCharacterForTransformText( 374 uint8_t aChar); 375 template bool nsTextFrameUtils::IsSkippableCharacterForTransformText( 376 char16_t aChar); 377 378 template <typename CharT> 379 static uint32_t DoComputeApproximateLengthWithWhitespaceCompression( 380 const CharT* aChars, uint32_t aLength, const nsStyleText* aStyleText) { 381 // This is an approximation so we don't really need anything 382 // too fancy here. 383 uint32_t len; 384 if (aStyleText->WhiteSpaceIsSignificant()) { 385 return aLength; 386 } 387 bool prevWS = true; // more important to ignore blocks with 388 // only whitespace than get inline boundaries 389 // exactly right 390 len = 0; 391 for (uint32_t i = 0; i < aLength; ++i) { 392 CharT c = aChars[i]; 393 if (c == ' ' || c == '\n' || c == '\t' || c == '\r') { 394 if (!prevWS) { 395 ++len; 396 } 397 prevWS = true; 398 } else { 399 ++len; 400 prevWS = false; 401 } 402 } 403 return len; 404 } 405 406 uint32_t nsTextFrameUtils::ComputeApproximateLengthWithWhitespaceCompression( 407 Text* aText, const nsStyleText* aStyleText) { 408 const CharacterDataBuffer* characterDataBuffer = &aText->DataBuffer(); 409 if (characterDataBuffer->Is2b()) { 410 return DoComputeApproximateLengthWithWhitespaceCompression( 411 characterDataBuffer->Get2b(), characterDataBuffer->GetLength(), 412 aStyleText); 413 } 414 return DoComputeApproximateLengthWithWhitespaceCompression( 415 characterDataBuffer->Get1b(), characterDataBuffer->GetLength(), 416 aStyleText); 417 } 418 419 uint32_t nsTextFrameUtils::ComputeApproximateLengthWithWhitespaceCompression( 420 const nsAString& aString, const nsStyleText* aStyleText) { 421 return DoComputeApproximateLengthWithWhitespaceCompression( 422 aString.BeginReading(), aString.Length(), aStyleText); 423 } 424 425 bool nsSkipCharsRunIterator::NextRun() { 426 do { 427 if (mRunLength) { 428 mIterator.AdvanceOriginal(mRunLength); 429 NS_ASSERTION(mRunLength > 0, 430 "No characters in run (initial length too large?)"); 431 if (!mSkipped || mLengthIncludesSkipped) { 432 mRemainingLength -= mRunLength; 433 } 434 } 435 if (!mRemainingLength) { 436 return false; 437 } 438 int32_t length; 439 mSkipped = mIterator.IsOriginalCharSkipped(&length); 440 mRunLength = std::min(length, mRemainingLength); 441 } while (!mVisitSkipped && mSkipped); 442 443 return true; 444 }