TextDirectiveCreator.cpp (36786B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim:set ts=2 sw=2 sts=2 et cindent: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "TextDirectiveCreator.h" 8 9 #include "AbstractRange.h" 10 #include "Document.h" 11 #include "StaticRange.h" 12 #include "TextDirectiveUtil.h" 13 #include "mozilla/ErrorResult.h" 14 #include "mozilla/ResultVariant.h" 15 #include "nsFind.h" 16 #include "nsINode.h" 17 #include "nsRange.h" 18 19 namespace mozilla::dom { 20 21 TextDirectiveCreator::TextDirectiveCreator(Document* aDocument, 22 AbstractRange* aRange, 23 const TimeoutWatchdog* aWatchdog) 24 : mDocument(WrapNotNull(aDocument)), 25 mRange(WrapNotNull(aRange)), 26 mFinder(WrapNotNull(new nsFind())), 27 mWatchdog(aWatchdog) { 28 mFinder->SetNodeIndexCache(&mNodeIndexCache); 29 } 30 31 TextDirectiveCreator::~TextDirectiveCreator() { 32 mFinder->SetNodeIndexCache(nullptr); 33 } 34 35 /* static */ 36 mozilla::Result<nsCString, ErrorResult> 37 TextDirectiveCreator::CreateTextDirectiveFromRange( 38 Document* aDocument, AbstractRange* aInputRange, 39 const TimeoutWatchdog* aWatchdog) { 40 MOZ_ASSERT(aInputRange); 41 MOZ_ASSERT(!aInputRange->Collapsed()); 42 43 const RefPtr<AbstractRange> extendedRange = 44 MOZ_TRY(ExtendRangeToWordBoundaries(aInputRange)); 45 if (!extendedRange) { 46 return VoidCString(); 47 } 48 UniquePtr<TextDirectiveCreator> instance = 49 MOZ_TRY(CreateInstance(aDocument, extendedRange, aWatchdog)); 50 51 const bool succeededBuildingContextTerms = 52 MOZ_TRY(instance->CollectContextTerms()); 53 if (!succeededBuildingContextTerms) { 54 return VoidCString(); 55 } 56 instance->CollectContextTermWordBoundaryDistances(); 57 MOZ_TRY(instance->FindAllMatchingCandidates()); 58 return instance->CreateTextDirective(); 59 } 60 61 /* static */ Result<bool, ErrorResult> 62 TextDirectiveCreator::MustUseRangeBasedMatching(AbstractRange* aRange) { 63 MOZ_ASSERT(aRange); 64 if (TextDirectiveUtil::FindBlockBoundaryInRange<TextScanDirection::Right>( 65 *aRange) 66 .isSome()) { 67 TEXT_FRAGMENT_LOG( 68 "Use range-based matching because the target range contains a block " 69 "boundary."); 70 return true; 71 } 72 const nsString rangeContent = 73 MOZ_TRY(TextDirectiveUtil::RangeContentAsString(aRange)); 74 75 const uint32_t kMaxLength = StaticPrefs:: 76 dom_text_fragments_create_text_fragment_exact_match_max_length(); 77 const bool rangeTooLong = rangeContent.Length() > kMaxLength; 78 if (rangeTooLong) { 79 TEXT_FRAGMENT_LOG( 80 "Use range-based matching because the target range is too long " 81 "({} chars > {} threshold)", 82 rangeContent.Length(), kMaxLength); 83 } else { 84 TEXT_FRAGMENT_LOG("Use exact matching."); 85 } 86 return rangeTooLong; 87 } 88 89 Result<UniquePtr<TextDirectiveCreator>, ErrorResult> 90 TextDirectiveCreator::CreateInstance(Document* aDocument, AbstractRange* aRange, 91 const TimeoutWatchdog* aWatchdog) { 92 return MOZ_TRY(MustUseRangeBasedMatching(aRange)) 93 ? UniquePtr<TextDirectiveCreator>( 94 new RangeBasedTextDirectiveCreator(aDocument, aRange, 95 aWatchdog)) 96 : UniquePtr<TextDirectiveCreator>( 97 new ExactMatchTextDirectiveCreator(aDocument, aRange, 98 aWatchdog)); 99 } 100 101 /*static*/ 102 Result<RefPtr<AbstractRange>, ErrorResult> 103 TextDirectiveCreator::ExtendRangeToWordBoundaries(AbstractRange* aRange) { 104 MOZ_ASSERT(aRange && !aRange->Collapsed()); 105 ErrorResult rv; 106 const nsString rangeContent = 107 MOZ_TRY(TextDirectiveUtil::RangeContentAsString(aRange)); 108 TEXT_FRAGMENT_LOG("Input range :\n{}", NS_ConvertUTF16toUTF8(rangeContent)); 109 110 if (rangeContent.IsEmpty()) { 111 TEXT_FRAGMENT_LOG("Input range does not contain text."); 112 return {nullptr}; 113 } 114 115 if (std::all_of(rangeContent.View().cbegin(), rangeContent.View().cend(), 116 nsContentUtils::IsHTMLWhitespaceOrNBSP)) { 117 TEXT_FRAGMENT_LOG("Input range contains only whitespace."); 118 return {nullptr}; 119 } 120 if (std::all_of(rangeContent.View().cbegin(), rangeContent.View().cend(), 121 IsPunctuationForWordSelect)) { 122 RangeBoundary startPoint = TextDirectiveUtil::FindNextNonWhitespacePosition< 123 TextScanDirection::Right>(aRange->StartRef()); 124 RangeBoundary endPoint = TextDirectiveUtil::FindNextNonWhitespacePosition< 125 TextScanDirection::Left>(aRange->EndRef()); 126 RefPtr range = StaticRange::Create(startPoint, endPoint, rv); 127 if (MOZ_UNLIKELY(rv.Failed())) { 128 return Err(std::move(rv)); 129 } 130 return {range}; 131 } 132 RangeBoundary startPoint = TextDirectiveUtil::FindNextNonWhitespacePosition< 133 TextScanDirection::Right>(aRange->StartRef()); 134 startPoint = 135 TextDirectiveUtil::FindWordBoundary<TextScanDirection::Left>(startPoint); 136 137 RangeBoundary endPoint = 138 TextDirectiveUtil::FindNextNonWhitespacePosition<TextScanDirection::Left>( 139 aRange->EndRef()); 140 endPoint = 141 TextDirectiveUtil::FindWordBoundary<TextScanDirection::Right>(endPoint); 142 #if MOZ_DIAGNOSTIC_ASSERT_ENABLED 143 auto cmp = nsContentUtils::ComparePoints(startPoint, endPoint); 144 MOZ_DIAGNOSTIC_ASSERT( 145 cmp && *cmp != 1, 146 "The new end point must not be before the start point."); 147 #endif 148 149 if (startPoint.IsSetAndValid() && endPoint.IsSetAndValid()) { 150 ErrorResult rv; 151 RefPtr<AbstractRange> range = StaticRange::Create(startPoint, endPoint, rv); 152 if (MOZ_UNLIKELY(rv.Failed())) { 153 return Err(std::move(rv)); 154 } 155 if (!range->Collapsed()) { 156 TEXT_FRAGMENT_LOG( 157 "Expanded target range to word boundaries:\n{}", 158 NS_ConvertUTF16toUTF8( 159 TextDirectiveUtil::RangeContentAsString(range).unwrapOr( 160 u"<Could not be converted to string>"_ns))); 161 162 return range; 163 } 164 } 165 TEXT_FRAGMENT_LOG("Extending to word boundaries collapsed the range."); 166 return {nullptr}; 167 } 168 169 Result<bool, ErrorResult> 170 ExactMatchTextDirectiveCreator::CollectContextTerms() { 171 if (MOZ_UNLIKELY(mRange->Collapsed())) { 172 return false; 173 } 174 TEXT_FRAGMENT_LOG("Collecting context terms for the target range."); 175 MOZ_TRY(CollectPrefixContextTerm()); 176 MOZ_TRY(CollectSuffixContextTerm()); 177 mStartContent = MOZ_TRY(TextDirectiveUtil::RangeContentAsString(mRange)); 178 TEXT_FRAGMENT_LOG("Start term:\n{}", NS_ConvertUTF16toUTF8(mStartContent)); 179 TEXT_FRAGMENT_LOG("No end term present (exact match)."); 180 return true; 181 } 182 183 Result<bool, ErrorResult> 184 RangeBasedTextDirectiveCreator::CollectContextTerms() { 185 if (MOZ_UNLIKELY(mRange->Collapsed())) { 186 return false; 187 } 188 TEXT_FRAGMENT_LOG("Collecting context terms for the target range."); 189 MOZ_TRY(CollectPrefixContextTerm()); 190 MOZ_TRY(CollectSuffixContextTerm()); 191 if (const Maybe<RangeBoundary> firstBlockBoundaryInRange = 192 TextDirectiveUtil::FindBlockBoundaryInRange<TextScanDirection::Right>( 193 *mRange)) { 194 TEXT_FRAGMENT_LOG( 195 "Target range contains a block boundary, collecting start and end " 196 "terms by considering the closest block boundaries inside the range."); 197 ErrorResult rv; 198 RefPtr startRange = 199 StaticRange::Create(mRange->StartRef(), *firstBlockBoundaryInRange, rv); 200 if (MOZ_UNLIKELY(rv.Failed())) { 201 return Err(std::move(rv)); 202 } 203 MOZ_DIAGNOSTIC_ASSERT(!startRange->Collapsed()); 204 mStartContent = 205 MOZ_TRY(TextDirectiveUtil::RangeContentAsString(startRange)); 206 if (MOZ_UNLIKELY(mStartContent.IsEmpty())) { 207 TEXT_FRAGMENT_LOG("Somehow got empty start term. Aborting."); 208 return false; 209 } 210 const Maybe<RangeBoundary> lastBlockBoundaryInRange = 211 TextDirectiveUtil::FindBlockBoundaryInRange<TextScanDirection::Left>( 212 *mRange); 213 MOZ_DIAGNOSTIC_ASSERT( 214 lastBlockBoundaryInRange.isSome(), 215 "If the target range contains a block boundary looking left-to-right, " 216 "it must also contain one looking right-to-left"); 217 RefPtr endRange = 218 StaticRange::Create(*lastBlockBoundaryInRange, mRange->EndRef(), rv); 219 if (MOZ_UNLIKELY(rv.Failed())) { 220 return Err(std::move(rv)); 221 } 222 MOZ_DIAGNOSTIC_ASSERT(!endRange->Collapsed()); 223 mEndContent = MOZ_TRY(TextDirectiveUtil::RangeContentAsString(endRange)); 224 if (MOZ_UNLIKELY(mEndContent.IsEmpty())) { 225 TEXT_FRAGMENT_LOG("Somehow got empty end term. Aborting."); 226 return false; 227 } 228 } else { 229 TEXT_FRAGMENT_LOG( 230 "Target range is too long, collecting start and end by dividing " 231 "content in the middle."); 232 mStartContent = MOZ_TRY(TextDirectiveUtil::RangeContentAsString(mRange)); 233 MOZ_DIAGNOSTIC_ASSERT( 234 mStartContent.Length() > 235 StaticPrefs:: 236 dom_text_fragments_create_text_fragment_exact_match_max_length()); 237 const auto [wordStart, wordEnd] = 238 intl::WordBreaker::FindWord(mStartContent, mStartContent.Length() / 2); 239 // This check is fine because the range content strings have compressed 240 // whitespace, therefore first and last character cannot be whitespace. 241 if (wordStart == 0 && wordEnd == mStartContent.Length()) { 242 TEXT_FRAGMENT_LOG( 243 "Target range only contains one word, which is longer than the " 244 "maximum length. Aborting."); 245 return false; 246 } 247 248 // These cases are hit when `mStartContent` contains a very large (>50% of 249 // the total length) word. Then the wordbreaker would return wordEnd=length 250 // if the long word is at the end of the string. In that case, use the 251 // wordStart position to break, so that mEndContent is not empty. 252 if (wordEnd == mStartContent.Length()) { 253 mEndContent = Substring(mStartContent, wordStart); 254 mStartContent = Substring(mStartContent, 0, wordStart); 255 } else { 256 mEndContent = Substring(mStartContent, wordEnd); 257 mStartContent = Substring(mStartContent, 0, wordEnd); 258 } 259 } 260 if (mStartContent.Length() > kMaxContextTermLength) { 261 TEXT_FRAGMENT_LOG( 262 "Start term seems very long ({} chars), " 263 "only considering the first {} chars.", 264 mStartContent.Length(), kMaxContextTermLength); 265 mStartContent = Substring(mStartContent, 0, kMaxContextTermLength); 266 } 267 mStartContent.CompressWhitespace(); 268 mStartFoldCaseContent = mStartContent; 269 ToFoldedCase(mStartFoldCaseContent); 270 TEXT_FRAGMENT_LOG("Maximum possible start term:\n{}", 271 NS_ConvertUTF16toUTF8(mStartContent)); 272 if (mEndContent.Length() > kMaxContextTermLength) { 273 TEXT_FRAGMENT_LOG( 274 "End term seems very long ({} chars), " 275 "only considering the last {} chars.", 276 mEndContent.Length(), kMaxContextTermLength); 277 mEndContent = 278 Substring(mEndContent, mEndContent.Length() - kMaxContextTermLength); 279 } 280 mEndContent.CompressWhitespace(); 281 mEndFoldCaseContent = mEndContent; 282 ToFoldedCase(mEndFoldCaseContent); 283 TEXT_FRAGMENT_LOG("Maximum possible end term:\n{}", 284 NS_ConvertUTF16toUTF8(mEndContent)); 285 return true; 286 } 287 288 Result<Ok, ErrorResult> TextDirectiveCreator::CollectPrefixContextTerm() { 289 TEXT_FRAGMENT_LOG("Collecting prefix term for the target range."); 290 ErrorResult rv; 291 RangeBoundary prefixEnd = 292 TextDirectiveUtil::FindNextNonWhitespacePosition<TextScanDirection::Left>( 293 mRange->StartRef()); 294 RangeBoundary prefixStart = 295 TextDirectiveUtil::FindNextBlockBoundary<TextScanDirection::Left>( 296 prefixEnd); 297 RefPtr prefixRange = StaticRange::Create(prefixStart, prefixEnd, rv); 298 if (MOZ_UNLIKELY(rv.Failed())) { 299 return Err(std::move(rv)); 300 } 301 MOZ_ASSERT(prefixRange); 302 mPrefixContent = 303 MOZ_TRY(TextDirectiveUtil::RangeContentAsString(prefixRange)); 304 if (mPrefixContent.Length() > kMaxContextTermLength) { 305 // cut off the prefix content at a word boundary near the max context term 306 // length to make sure the term does not start with whitespace. 307 auto [wordBegin, wordEnd] = intl::WordBreaker::FindWord( 308 mPrefixContent, mPrefixContent.Length() - kMaxContextTermLength); 309 while (nsContentUtils::IsHTMLWhitespace(mPrefixContent.CharAt(wordBegin))) { 310 ++wordBegin; 311 } 312 TEXT_FRAGMENT_LOG( 313 "Prefix term seems very long ({} chars), " 314 "only considering the last {} chars.", 315 mPrefixContent.Length(), wordBegin); 316 mPrefixContent = Substring(mPrefixContent, wordBegin); 317 } 318 mPrefixFoldCaseContent = mPrefixContent; 319 ToFoldedCase(mPrefixFoldCaseContent); 320 TEXT_FRAGMENT_LOG("Maximum possible prefix term:\n{}", 321 NS_ConvertUTF16toUTF8(mPrefixContent)); 322 return Ok(); 323 } 324 325 Result<Ok, ErrorResult> TextDirectiveCreator::CollectSuffixContextTerm() { 326 TEXT_FRAGMENT_LOG("Collecting suffix term for the target range."); 327 ErrorResult rv; 328 RangeBoundary suffixBegin = TextDirectiveUtil::FindNextNonWhitespacePosition< 329 TextScanDirection::Right>(mRange->EndRef()); 330 RangeBoundary suffixEnd = 331 TextDirectiveUtil::FindNextBlockBoundary<TextScanDirection::Right>( 332 suffixBegin); 333 RefPtr suffixRange = StaticRange::Create(suffixBegin, suffixEnd, rv); 334 if (MOZ_UNLIKELY(rv.Failed())) { 335 return Err(std::move(rv)); 336 } 337 MOZ_ASSERT(suffixRange); 338 mSuffixContent = 339 MOZ_TRY(TextDirectiveUtil::RangeContentAsString(suffixRange)); 340 if (mSuffixContent.Length() > kMaxContextTermLength) { 341 // cut off the suffix content at a word boundary near the max context term 342 // length to make sure the term does not end with whitespace. 343 auto [wordBegin, wordEnd] = 344 intl::WordBreaker::FindWord(mSuffixContent, kMaxContextTermLength); 345 while ( 346 nsContentUtils::IsHTMLWhitespace(mSuffixContent.CharAt(wordEnd - 1))) { 347 --wordEnd; 348 } 349 TEXT_FRAGMENT_LOG( 350 "Suffix term seems very long ({} chars), " 351 "only considering the first {} chars.", 352 mSuffixContent.Length(), wordEnd + 1); 353 mSuffixContent = Substring(mSuffixContent, 0, wordEnd + 1); 354 } 355 mSuffixFoldCaseContent = mSuffixContent; 356 ToFoldedCase(mSuffixFoldCaseContent); 357 TEXT_FRAGMENT_LOG("Maximum possible suffix term:\n{}", 358 NS_ConvertUTF16toUTF8(mSuffixContent)); 359 return Ok(); 360 } 361 362 void ExactMatchTextDirectiveCreator::CollectContextTermWordBoundaryDistances() { 363 mPrefixWordBeginDistances = 364 TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Left>( 365 mPrefixContent); 366 TEXT_FRAGMENT_LOG("Word begin distances for prefix term: {}", 367 mPrefixWordBeginDistances); 368 mSuffixWordEndDistances = 369 TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>( 370 mSuffixContent); 371 TEXT_FRAGMENT_LOG("Word end distances for suffix term: {}", 372 mSuffixWordEndDistances); 373 } 374 375 void RangeBasedTextDirectiveCreator::CollectContextTermWordBoundaryDistances() { 376 mPrefixWordBeginDistances = 377 TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Left>( 378 mPrefixContent); 379 TEXT_FRAGMENT_LOG("Word begin distances for prefix term: {}", 380 mPrefixWordBeginDistances); 381 MOZ_DIAGNOSTIC_ASSERT(!mStartContent.IsEmpty()); 382 mStartWordEndDistances = 383 TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>( 384 mStartContent); 385 MOZ_DIAGNOSTIC_ASSERT(!mStartWordEndDistances.IsEmpty(), 386 "There must be at least one word in the start term."); 387 MOZ_DIAGNOSTIC_ASSERT(mStartWordEndDistances[0] > 0); 388 mFirstWordOfStartContent = 389 Substring(mStartContent, 0, mStartWordEndDistances[0]); 390 TEXT_FRAGMENT_LOG("First word of start term: {}", 391 NS_ConvertUTF16toUTF8(mFirstWordOfStartContent)); 392 if (mStartWordEndDistances[0] == mStartContent.Length()) { 393 mStartFirstWordLengthIncludingWhitespace = mStartContent.Length(); 394 mStartWordEndDistances.Clear(); 395 TEXT_FRAGMENT_LOG("Start term cannot be extended."); 396 } else { 397 mStartFirstWordLengthIncludingWhitespace = 398 TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray< 399 TextScanDirection::Right>(mStartFoldCaseContent, 400 mStartWordEndDistances); 401 TEXT_FRAGMENT_LOG( 402 "Word end distances for start term, starting at the beginning of the " 403 "second word: {}", 404 mStartWordEndDistances); 405 } 406 407 MOZ_DIAGNOSTIC_ASSERT(!mEndContent.IsEmpty()); 408 mEndWordBeginDistances = 409 TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Left>( 410 mEndContent); 411 MOZ_DIAGNOSTIC_ASSERT(!mEndWordBeginDistances.IsEmpty(), 412 "There must be at least one word in the end term."); 413 MOZ_DIAGNOSTIC_ASSERT(mEndWordBeginDistances[0] > 0); 414 mLastWordOfEndContent = 415 Substring(mEndContent, mEndContent.Length() - mEndWordBeginDistances[0]); 416 TEXT_FRAGMENT_LOG("Last word of end term: {}", 417 NS_ConvertUTF16toUTF8(mLastWordOfEndContent)); 418 if (mEndWordBeginDistances[0] == mEndContent.Length()) { 419 mEndLastWordLengthIncludingWhitespace = mEndContent.Length(); 420 mEndWordBeginDistances.Clear(); 421 TEXT_FRAGMENT_LOG("End term cannot be extended."); 422 } else { 423 mEndLastWordLengthIncludingWhitespace = 424 TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray< 425 TextScanDirection::Left>(mEndFoldCaseContent, 426 mEndWordBeginDistances); 427 TEXT_FRAGMENT_LOG( 428 "Word begin distances for end term, starting at the end of the second " 429 "last word: {}", 430 mEndWordBeginDistances); 431 } 432 433 mSuffixWordEndDistances = 434 TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>( 435 mSuffixContent); 436 TEXT_FRAGMENT_LOG("Word end distances for suffix term: {}", 437 mSuffixWordEndDistances); 438 } 439 440 Result<nsTArray<RefPtr<AbstractRange>>, ErrorResult> 441 TextDirectiveCreator::FindAllMatchingRanges(const nsString& aSearchQuery, 442 const RangeBoundary& aSearchStart, 443 const RangeBoundary& aSearchEnd) { 444 MOZ_ASSERT(!aSearchQuery.IsEmpty()); 445 RangeBoundary searchStart = aSearchStart; 446 nsTArray<RefPtr<AbstractRange>> matchingRanges; 447 448 while (true) { 449 if (mWatchdog && mWatchdog->IsDone()) { 450 return matchingRanges; 451 } 452 RefPtr<AbstractRange> searchResult = TextDirectiveUtil::FindStringInRange( 453 mFinder, searchStart, aSearchEnd, aSearchQuery, true, true); 454 if (!searchResult || searchResult->Collapsed()) { 455 break; 456 } 457 searchStart = searchResult->StartRef(); 458 if (auto cmp = nsContentUtils::ComparePoints(searchStart, aSearchEnd, 459 &mNodeIndexCache); 460 !cmp || *cmp != -1) { 461 // this means hitting a bug in nsFind which apparently does not stop 462 // exactly where it is told to. There are cases where it might 463 // overshoot, e.g. if `aSearchEnd` is a text node with offset=0. 464 // However, due to reusing the cache used by nsFind this additional call 465 // to ComparePoints should be very cheap. 466 break; 467 } 468 matchingRanges.AppendElement(searchResult); 469 470 MOZ_DIAGNOSTIC_ASSERT(searchResult->GetStartContainer()->IsText()); 471 auto newSearchStart = 472 TextDirectiveUtil::MoveToNextBoundaryPoint(searchStart); 473 MOZ_DIAGNOSTIC_ASSERT(newSearchStart != searchStart); 474 searchStart = newSearchStart; 475 if (auto cmp = nsContentUtils::ComparePoints(searchStart, aSearchEnd, 476 &mNodeIndexCache); 477 !cmp || *cmp != -1) { 478 break; 479 } 480 } 481 482 TEXT_FRAGMENT_LOG( 483 "Found {} matches for the input '{}' in the partial document.", 484 matchingRanges.Length(), NS_ConvertUTF16toUTF8(aSearchQuery)); 485 return matchingRanges; 486 } 487 488 Result<Ok, ErrorResult> 489 ExactMatchTextDirectiveCreator::FindAllMatchingCandidates() { 490 if (MOZ_UNLIKELY(mRange->Collapsed())) { 491 return Ok(); 492 } 493 494 TEXT_FRAGMENT_LOG( 495 "Searching all occurrences of range content ({}) in the partial document " 496 "from document begin to begin of target range.", 497 NS_ConvertUTF16toUTF8(mStartContent)); 498 const nsTArray<RefPtr<AbstractRange>> matchRanges = 499 MOZ_TRY(FindAllMatchingRanges(mStartContent, {mDocument, 0u}, 500 mRange->StartRef())); 501 FindCommonSubstringLengths(matchRanges); 502 return Ok(); 503 } 504 505 void ExactMatchTextDirectiveCreator::FindCommonSubstringLengths( 506 const nsTArray<RefPtr<AbstractRange>>& aMatchRanges) { 507 if (mWatchdog && mWatchdog->IsDone()) { 508 return; 509 } 510 size_t loopCounter = 0; 511 for (const auto& range : aMatchRanges) { 512 TEXT_FRAGMENT_LOG("Computing common prefix substring length for match {}.", 513 ++loopCounter); 514 const uint32_t commonPrefixLength = 515 TextDirectiveUtil::ComputeCommonSubstringLength< 516 TextScanDirection::Left>( 517 mPrefixFoldCaseContent, 518 TextDirectiveUtil::FindNextNonWhitespacePosition< 519 TextScanDirection::Left>(range->StartRef())); 520 521 TEXT_FRAGMENT_LOG("Computing common suffix substring length for match {}.", 522 loopCounter); 523 const uint32_t commonSuffixLength = 524 TextDirectiveUtil::ComputeCommonSubstringLength< 525 TextScanDirection::Right>( 526 mSuffixFoldCaseContent, 527 TextDirectiveUtil::FindNextNonWhitespacePosition< 528 TextScanDirection::Right>(range->EndRef())); 529 530 mCommonSubstringLengths.EmplaceBack(commonPrefixLength, commonSuffixLength); 531 } 532 } 533 534 Result<Ok, ErrorResult> 535 RangeBasedTextDirectiveCreator::FindAllMatchingCandidates() { 536 MOZ_DIAGNOSTIC_ASSERT(!mFirstWordOfStartContent.IsEmpty(), 537 "Minimal start content must not be empty."); 538 MOZ_DIAGNOSTIC_ASSERT(!mLastWordOfEndContent.IsEmpty(), 539 "Minimal end content must not be empty."); 540 541 TEXT_FRAGMENT_LOG( 542 "Searching all occurrences of first word of start content ({}) in the " 543 "partial document from document begin to begin of the target range.", 544 NS_ConvertUTF16toUTF8(mFirstWordOfStartContent)); 545 546 const nsTArray<RefPtr<AbstractRange>> startContentRanges = 547 MOZ_TRY(FindAllMatchingRanges(mFirstWordOfStartContent, {mDocument, 0u}, 548 mRange->StartRef())); 549 FindStartMatchCommonSubstringLengths(startContentRanges); 550 551 if (mWatchdog && mWatchdog->IsDone()) { 552 return Ok(); 553 } 554 TEXT_FRAGMENT_LOG( 555 "Searching all occurrences of last word of end content ({}) in the " 556 "partial document from beginning of the target range to the end of the " 557 "target range, excluding the last word.", 558 NS_ConvertUTF16toUTF8(mLastWordOfEndContent)); 559 560 auto searchEnd = 561 TextDirectiveUtil::FindNextNonWhitespacePosition<TextScanDirection::Left>( 562 mRange->EndRef()); 563 searchEnd = 564 TextDirectiveUtil::FindWordBoundary<TextScanDirection::Left>(searchEnd); 565 566 const nsTArray<RefPtr<AbstractRange>> endContentRanges = 567 MOZ_TRY(FindAllMatchingRanges(mLastWordOfEndContent, mRange->StartRef(), 568 searchEnd)); 569 FindEndMatchCommonSubstringLengths(endContentRanges); 570 return Ok(); 571 } 572 573 void RangeBasedTextDirectiveCreator::FindStartMatchCommonSubstringLengths( 574 const nsTArray<RefPtr<AbstractRange>>& aMatchRanges) { 575 size_t loopCounter = 0; 576 for (const auto& range : aMatchRanges) { 577 if (mWatchdog && mWatchdog->IsDone()) { 578 return; 579 } 580 ++loopCounter; 581 TEXT_FRAGMENT_LOG( 582 "Computing common prefix substring length for start match {}.", 583 loopCounter); 584 const uint32_t commonPrefixLength = 585 TextDirectiveUtil::ComputeCommonSubstringLength< 586 TextScanDirection::Left>( 587 mPrefixFoldCaseContent, 588 TextDirectiveUtil::FindNextNonWhitespacePosition< 589 TextScanDirection::Left>(range->StartRef())); 590 TEXT_FRAGMENT_LOG("Common prefix length: {}", commonPrefixLength); 591 592 TEXT_FRAGMENT_LOG( 593 "Computing common start substring length for start match {}.", 594 loopCounter); 595 const uint32_t commonStartLength = 596 TextDirectiveUtil::ComputeCommonSubstringLength< 597 TextScanDirection::Right>( 598 mStartFoldCaseContent, 599 TextDirectiveUtil::FindNextNonWhitespacePosition< 600 TextScanDirection::Right>(range->EndRef())); 601 602 TEXT_FRAGMENT_LOG("Common length: {}", commonStartLength); 603 mStartMatchCommonSubstringLengths.EmplaceBack(commonPrefixLength, 604 commonStartLength); 605 } 606 } 607 608 void RangeBasedTextDirectiveCreator::FindEndMatchCommonSubstringLengths( 609 const nsTArray<RefPtr<AbstractRange>>& aMatchRanges) { 610 size_t loopCounter = 0; 611 for (const auto& range : aMatchRanges) { 612 if (mWatchdog && mWatchdog->IsDone()) { 613 return; 614 } 615 ++loopCounter; 616 TEXT_FRAGMENT_LOG("Computing common end substring length for end match {}.", 617 loopCounter); 618 const uint32_t commonEndLength = 619 TextDirectiveUtil::ComputeCommonSubstringLength< 620 TextScanDirection::Left>( 621 mEndFoldCaseContent, 622 TextDirectiveUtil::FindNextNonWhitespacePosition< 623 TextScanDirection::Left>(range->StartRef())); 624 TEXT_FRAGMENT_LOG("Common end term length: {}", commonEndLength); 625 TEXT_FRAGMENT_LOG( 626 "Computing common suffix substring length for end match {}.", 627 loopCounter); 628 const uint32_t commonSuffixLength = 629 TextDirectiveUtil::ComputeCommonSubstringLength< 630 TextScanDirection::Right>( 631 mSuffixFoldCaseContent, 632 TextDirectiveUtil::FindNextNonWhitespacePosition< 633 TextScanDirection::Right>(range->EndRef())); 634 TEXT_FRAGMENT_LOG("Common suffix length: {}", commonSuffixLength); 635 636 mEndMatchCommonSubstringLengths.EmplaceBack(commonEndLength, 637 commonSuffixLength); 638 } 639 } 640 641 Result<nsCString, ErrorResult> TextDirectiveCreator::CreateTextDirective() { 642 if (mWatchdog && mWatchdog->IsDone()) { 643 TEXT_FRAGMENT_LOG("Hitting timeout."); 644 return VoidCString(); 645 } 646 if (mRange->Collapsed()) { 647 TEXT_FRAGMENT_LOG("Input range collapsed."); 648 return VoidCString(); 649 } 650 if (mStartContent.IsEmpty()) { 651 TEXT_FRAGMENT_LOG("Input range is empty."); 652 return VoidCString(); 653 } 654 655 if (const Maybe<TextDirective> textDirective = FindShortestCombination()) { 656 nsCString textDirectiveString; 657 DebugOnly<bool> ret = 658 create_text_directive(&*textDirective, &textDirectiveString); 659 MOZ_ASSERT(ret); 660 TEXT_FRAGMENT_LOG("Created text directive: {}", textDirectiveString); 661 return textDirectiveString; 662 } 663 TEXT_FRAGMENT_LOG( 664 "It's not possible to create a text directive for the given range."); 665 return nsCString{}; 666 } 667 668 /*static*/ std::tuple<nsTArray<uint32_t>, nsTArray<uint32_t>> 669 TextDirectiveCreator::ExtendSubstringLengthsToWordBoundaries( 670 const nsTArray<std::tuple<uint32_t, uint32_t>>& aExactSubstringLengths, 671 const Span<const uint32_t>& aFirstWordPositions, 672 const Span<const uint32_t>& aSecondWordPositions) { 673 const auto getNextWordBoundaryPosition = 674 [](const Span<const uint32_t>& distances, uint32_t length) { 675 // Note: This algorithm works for word begins and word ends, 676 // since the position arrays for properties that go right-to-left 677 // (prefix, end) are reversed and start from the end of the 678 // strings. 679 for (const uint32_t distance : distances) { 680 if (distance > length) { 681 return distance; 682 } 683 } 684 return distances.IsEmpty() ? 0 : distances.at(distances.Length() - 1); 685 }; 686 687 const auto hashSetToSortedArray = [](const nsTHashSet<uint32_t>& aHashSet) { 688 AutoTArray<uint32_t, 64> array; 689 for (auto value : aHashSet) { 690 array.InsertElementSorted(value); 691 } 692 return array; 693 }; 694 nsTHashSet<uint32_t> firstSet; 695 nsTHashSet<uint32_t> secondSet; 696 firstSet.Insert(0); 697 secondSet.Insert(0); 698 // This loop is O(n^2) in the worst case, but the number of 699 // aFirstWordPositions and aSecondWordPositions is small (< 32). 700 // Also, one of the purposes of this algorithm is to bucket the exact lengths 701 // (which represent the amount of matches for the target range) into word 702 // bounded lengths. This means that the number of unique word bounded lengths 703 // is < 32. 704 for (const auto& [first, second] : aExactSubstringLengths) { 705 firstSet.Insert(getNextWordBoundaryPosition(aFirstWordPositions, first)); 706 secondSet.Insert(getNextWordBoundaryPosition(aSecondWordPositions, second)); 707 } 708 return {hashSetToSortedArray(firstSet), hashSetToSortedArray(secondSet)}; 709 } 710 711 /*static*/ 712 Maybe<std::tuple<uint32_t, uint32_t>> 713 TextDirectiveCreator::CheckAllCombinations( 714 const nsTArray<std::tuple<uint32_t, uint32_t>>& aExactWordLengths, 715 const nsTArray<uint32_t>& aFirstExtendedToWordBoundaries, 716 const nsTArray<uint32_t>& aSecondExtendedToWordBoundaries) { 717 nsTArray<std::tuple<uint32_t, uint32_t, uint32_t>> sortedCandidates; 718 sortedCandidates.SetCapacity(aFirstExtendedToWordBoundaries.Length() * 719 aSecondExtendedToWordBoundaries.Length()); 720 721 // Create all combinations of the extended values and sort them by their 722 // cost function value (sum of the two values). 723 // The cost function value is used to sort the candidates, so that the 724 // candidates with the lowest cost function value are checked first. Since the 725 // algorithm searches for the shortest possible combination, it can return as 726 // soon as it finds a valid combination. 727 for (const uint32_t firstExtendedToWordBoundary : 728 aFirstExtendedToWordBoundaries) { 729 for (const uint32_t secondExtendedToWordBoundary : 730 aSecondExtendedToWordBoundaries) { 731 const uint32_t costFunctionValue = 732 firstExtendedToWordBoundary + secondExtendedToWordBoundary; 733 sortedCandidates.InsertElementSorted( 734 std::tuple{firstExtendedToWordBoundary, secondExtendedToWordBoundary, 735 costFunctionValue}, 736 [](const auto& a, const auto& b) -> int { 737 return std::get<2>(a) - std::get<2>(b); 738 }); 739 } 740 } 741 for (auto [firstExtendedToWordBoundary, secondExtendedToWordBoundary, 742 costFunctionValue] : sortedCandidates) { 743 TEXT_FRAGMENT_LOG("Checking candidate ({},{}). Score: {}", 744 firstExtendedToWordBoundary, secondExtendedToWordBoundary, 745 costFunctionValue); 746 const bool isInvalid = AnyOf( 747 aExactWordLengths.begin(), aExactWordLengths.end(), 748 [firstExtended = firstExtendedToWordBoundary, 749 secondExtended = secondExtendedToWordBoundary]( 750 const std::tuple<uint32_t, uint32_t>& exactWordLengths) { 751 const auto [firstExact, secondExact] = exactWordLengths; 752 return firstExtended <= firstExact && secondExtended <= secondExact; 753 }); 754 if (isInvalid) { 755 TEXT_FRAGMENT_LOG( 756 "Current candidate doesn't eliminate all matches. Discarding this " 757 "candidate."); 758 continue; 759 } 760 TEXT_FRAGMENT_LOG("Current candidate ({},{}) is the best candidate.", 761 firstExtendedToWordBoundary, 762 secondExtendedToWordBoundary); 763 return Some( 764 std::tuple{firstExtendedToWordBoundary, secondExtendedToWordBoundary}); 765 } 766 return Nothing{}; 767 } 768 769 Maybe<TextDirective> ExactMatchTextDirectiveCreator::FindShortestCombination() 770 const { 771 const auto [prefixLengths, suffixLengths] = 772 TextDirectiveCreator::ExtendSubstringLengthsToWordBoundaries( 773 mCommonSubstringLengths, mPrefixWordBeginDistances, 774 mSuffixWordEndDistances); 775 TEXT_FRAGMENT_LOG("Find shortest combination based on prefix and suffix."); 776 TEXT_FRAGMENT_LOG("Matches to eliminate: {}, Total combinations: {}", 777 mCommonSubstringLengths.Length(), 778 prefixLengths.Length() * suffixLengths.Length()); 779 TEXT_FRAGMENT_LOG("Checking prefix lengths (extended to word boundaries): {}", 780 prefixLengths); 781 TEXT_FRAGMENT_LOG("Checking suffix lengths (extended to word boundaries): {}", 782 suffixLengths); 783 TEXT_FRAGMENT_LOG("Matches: {}", mCommonSubstringLengths); 784 return CheckAllCombinations(mCommonSubstringLengths, prefixLengths, 785 suffixLengths) 786 .andThen([&](std::tuple<uint32_t, uint32_t> bestMatch) { 787 const auto [prefixLength, suffixLength] = bestMatch; 788 TextDirective td; 789 if (prefixLength) { 790 td.prefix = 791 Substring(mPrefixContent, mPrefixContent.Length() - prefixLength); 792 } 793 td.start = mStartContent; 794 if (suffixLength) { 795 td.suffix = Substring(mSuffixContent, 0, suffixLength); 796 } 797 return Some(td); 798 }); 799 } 800 801 Maybe<TextDirective> RangeBasedTextDirectiveCreator::FindShortestCombination() 802 const { 803 // For this algorithm, ignore the first word of the start term and the last 804 // word of the end term (which are required). This allows the optimization 805 // algorithm to minimize to 0. 806 auto [prefixLengths, startLengths] = ExtendSubstringLengthsToWordBoundaries( 807 mStartMatchCommonSubstringLengths, mPrefixWordBeginDistances, 808 mStartWordEndDistances); 809 810 TEXT_FRAGMENT_LOG( 811 "Find shortest combination for start match based on prefix and start"); 812 TEXT_FRAGMENT_LOG("Matches to eliminate: {}, Total combinations: {}", 813 mStartMatchCommonSubstringLengths.Length(), 814 prefixLengths.Length() * startLengths.Length()); 815 TEXT_FRAGMENT_LOG("Checking prefix lengths (extended to word boundaries): {}", 816 prefixLengths); 817 TEXT_FRAGMENT_LOG("Checking start lengths (extended to word boundaries): {}", 818 startLengths); 819 TEXT_FRAGMENT_LOG("Matches: {}", mStartMatchCommonSubstringLengths); 820 821 const auto bestStartMatch = CheckAllCombinations( 822 mStartMatchCommonSubstringLengths, prefixLengths, startLengths); 823 if (MOZ_UNLIKELY(bestStartMatch.isNothing())) { 824 TEXT_FRAGMENT_LOG( 825 "Could not find unique start match. It's not possible to create a text " 826 "directive for the target range."); 827 return Nothing{}; 828 } 829 auto [endLengths, suffixLengths] = ExtendSubstringLengthsToWordBoundaries( 830 mEndMatchCommonSubstringLengths, mEndWordBeginDistances, 831 mSuffixWordEndDistances); 832 833 TEXT_FRAGMENT_LOG( 834 "Find shortest combination for end match based on end and suffix"); 835 TEXT_FRAGMENT_LOG("Matches to eliminate: {}, Total combinations: {}", 836 mEndMatchCommonSubstringLengths.Length(), 837 endLengths.Length() * suffixLengths.Length()); 838 TEXT_FRAGMENT_LOG("Checking end lengths (extended to word boundaries): {}", 839 endLengths); 840 TEXT_FRAGMENT_LOG("Checking suffix lengths (extended to word boundaries): {}", 841 suffixLengths); 842 TEXT_FRAGMENT_LOG("Matches: {}", mEndMatchCommonSubstringLengths); 843 const auto bestEndMatch = CheckAllCombinations( 844 mEndMatchCommonSubstringLengths, endLengths, suffixLengths); 845 if (MOZ_UNLIKELY(bestEndMatch.isNothing())) { 846 TEXT_FRAGMENT_LOG( 847 "Could not find unique end match. It's not possible to create a text " 848 "directive for the target range."); 849 return Nothing{}; 850 } 851 const auto [prefixLength, startLength] = *bestStartMatch; 852 const auto [endLength, suffixLength] = *bestEndMatch; 853 TextDirective td; 854 if (prefixLength) { 855 td.prefix = 856 Substring(mPrefixContent, mPrefixContent.Length() - prefixLength); 857 } 858 859 if (startLength) { 860 const uint32_t startLengthIncludingFirstWord = 861 mStartFirstWordLengthIncludingWhitespace + startLength; 862 MOZ_DIAGNOSTIC_ASSERT(startLengthIncludingFirstWord <= 863 mStartContent.Length()); 864 td.start = Substring(mStartContent, 0, startLengthIncludingFirstWord); 865 } else { 866 td.start = mFirstWordOfStartContent; 867 } 868 if (endLength) { 869 const uint32_t endLengthIncludingLastWord = 870 mEndLastWordLengthIncludingWhitespace + endLength; 871 872 MOZ_DIAGNOSTIC_ASSERT(endLengthIncludingLastWord <= mEndContent.Length()); 873 td.end = Substring(mEndContent, 874 mEndContent.Length() - endLengthIncludingLastWord); 875 } else { 876 td.end = mLastWordOfEndContent; 877 } 878 879 if (suffixLength) { 880 td.suffix = Substring(mSuffixContent, 0, suffixLength); 881 } 882 883 return Some(td); 884 } 885 886 } // namespace mozilla::dom