[ tor-browser ].git.dasho

TextDirectiveCreator.cpp (36786B)
      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim:set ts=2 sw=2 sts=2 et cindent: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "TextDirectiveCreator.h"
      8 
      9 #include "AbstractRange.h"
     10 #include "Document.h"
     11 #include "StaticRange.h"
     12 #include "TextDirectiveUtil.h"
     13 #include "mozilla/ErrorResult.h"
     14 #include "mozilla/ResultVariant.h"
     15 #include "nsFind.h"
     16 #include "nsINode.h"
     17 #include "nsRange.h"
     18 
     19 namespace mozilla::dom {
     20 
     21 TextDirectiveCreator::TextDirectiveCreator(Document* aDocument,
     22                                           AbstractRange* aRange,
     23                                           const TimeoutWatchdog* aWatchdog)
     24    : mDocument(WrapNotNull(aDocument)),
     25      mRange(WrapNotNull(aRange)),
     26      mFinder(WrapNotNull(new nsFind())),
     27      mWatchdog(aWatchdog) {
     28  mFinder->SetNodeIndexCache(&mNodeIndexCache);
     29 }
     30 
     31 TextDirectiveCreator::~TextDirectiveCreator() {
     32  mFinder->SetNodeIndexCache(nullptr);
     33 }
     34 
     35 /* static */
     36 mozilla::Result<nsCString, ErrorResult>
     37 TextDirectiveCreator::CreateTextDirectiveFromRange(
     38    Document* aDocument, AbstractRange* aInputRange,
     39    const TimeoutWatchdog* aWatchdog) {
     40  MOZ_ASSERT(aInputRange);
     41  MOZ_ASSERT(!aInputRange->Collapsed());
     42 
     43  const RefPtr<AbstractRange> extendedRange =
     44      MOZ_TRY(ExtendRangeToWordBoundaries(aInputRange));
     45  if (!extendedRange) {
     46    return VoidCString();
     47  }
     48  UniquePtr<TextDirectiveCreator> instance =
     49      MOZ_TRY(CreateInstance(aDocument, extendedRange, aWatchdog));
     50 
     51  const bool succeededBuildingContextTerms =
     52      MOZ_TRY(instance->CollectContextTerms());
     53  if (!succeededBuildingContextTerms) {
     54    return VoidCString();
     55  }
     56  instance->CollectContextTermWordBoundaryDistances();
     57  MOZ_TRY(instance->FindAllMatchingCandidates());
     58  return instance->CreateTextDirective();
     59 }
     60 
     61 /* static */ Result<bool, ErrorResult>
     62 TextDirectiveCreator::MustUseRangeBasedMatching(AbstractRange* aRange) {
     63  MOZ_ASSERT(aRange);
     64  if (TextDirectiveUtil::FindBlockBoundaryInRange<TextScanDirection::Right>(
     65          *aRange)
     66          .isSome()) {
     67    TEXT_FRAGMENT_LOG(
     68        "Use range-based matching because the target range contains a block "
     69        "boundary.");
     70    return true;
     71  }
     72  const nsString rangeContent =
     73      MOZ_TRY(TextDirectiveUtil::RangeContentAsString(aRange));
     74 
     75  const uint32_t kMaxLength = StaticPrefs::
     76      dom_text_fragments_create_text_fragment_exact_match_max_length();
     77  const bool rangeTooLong = rangeContent.Length() > kMaxLength;
     78  if (rangeTooLong) {
     79    TEXT_FRAGMENT_LOG(
     80        "Use range-based matching because the target range is too long "
     81        "({} chars > {} threshold)",
     82        rangeContent.Length(), kMaxLength);
     83  } else {
     84    TEXT_FRAGMENT_LOG("Use exact matching.");
     85  }
     86  return rangeTooLong;
     87 }
     88 
     89 Result<UniquePtr<TextDirectiveCreator>, ErrorResult>
     90 TextDirectiveCreator::CreateInstance(Document* aDocument, AbstractRange* aRange,
     91                                     const TimeoutWatchdog* aWatchdog) {
     92  return MOZ_TRY(MustUseRangeBasedMatching(aRange))
     93             ? UniquePtr<TextDirectiveCreator>(
     94                   new RangeBasedTextDirectiveCreator(aDocument, aRange,
     95                                                      aWatchdog))
     96             : UniquePtr<TextDirectiveCreator>(
     97                   new ExactMatchTextDirectiveCreator(aDocument, aRange,
     98                                                      aWatchdog));
     99 }
    100 
    101 /*static*/
    102 Result<RefPtr<AbstractRange>, ErrorResult>
    103 TextDirectiveCreator::ExtendRangeToWordBoundaries(AbstractRange* aRange) {
    104  MOZ_ASSERT(aRange && !aRange->Collapsed());
    105  ErrorResult rv;
    106  const nsString rangeContent =
    107      MOZ_TRY(TextDirectiveUtil::RangeContentAsString(aRange));
    108  TEXT_FRAGMENT_LOG("Input range :\n{}", NS_ConvertUTF16toUTF8(rangeContent));
    109 
    110  if (rangeContent.IsEmpty()) {
    111    TEXT_FRAGMENT_LOG("Input range does not contain text.");
    112    return {nullptr};
    113  }
    114 
    115  if (std::all_of(rangeContent.View().cbegin(), rangeContent.View().cend(),
    116                  nsContentUtils::IsHTMLWhitespaceOrNBSP)) {
    117    TEXT_FRAGMENT_LOG("Input range contains only whitespace.");
    118    return {nullptr};
    119  }
    120  if (std::all_of(rangeContent.View().cbegin(), rangeContent.View().cend(),
    121                  IsPunctuationForWordSelect)) {
    122    RangeBoundary startPoint = TextDirectiveUtil::FindNextNonWhitespacePosition<
    123        TextScanDirection::Right>(aRange->StartRef());
    124    RangeBoundary endPoint = TextDirectiveUtil::FindNextNonWhitespacePosition<
    125        TextScanDirection::Left>(aRange->EndRef());
    126    RefPtr range = StaticRange::Create(startPoint, endPoint, rv);
    127    if (MOZ_UNLIKELY(rv.Failed())) {
    128      return Err(std::move(rv));
    129    }
    130    return {range};
    131  }
    132  RangeBoundary startPoint = TextDirectiveUtil::FindNextNonWhitespacePosition<
    133      TextScanDirection::Right>(aRange->StartRef());
    134  startPoint =
    135      TextDirectiveUtil::FindWordBoundary<TextScanDirection::Left>(startPoint);
    136 
    137  RangeBoundary endPoint =
    138      TextDirectiveUtil::FindNextNonWhitespacePosition<TextScanDirection::Left>(
    139          aRange->EndRef());
    140  endPoint =
    141      TextDirectiveUtil::FindWordBoundary<TextScanDirection::Right>(endPoint);
    142 #if MOZ_DIAGNOSTIC_ASSERT_ENABLED
    143  auto cmp = nsContentUtils::ComparePoints(startPoint, endPoint);
    144  MOZ_DIAGNOSTIC_ASSERT(
    145      cmp && *cmp != 1,
    146      "The new end point must not be before the start point.");
    147 #endif
    148 
    149  if (startPoint.IsSetAndValid() && endPoint.IsSetAndValid()) {
    150    ErrorResult rv;
    151    RefPtr<AbstractRange> range = StaticRange::Create(startPoint, endPoint, rv);
    152    if (MOZ_UNLIKELY(rv.Failed())) {
    153      return Err(std::move(rv));
    154    }
    155    if (!range->Collapsed()) {
    156      TEXT_FRAGMENT_LOG(
    157          "Expanded target range to word boundaries:\n{}",
    158          NS_ConvertUTF16toUTF8(
    159              TextDirectiveUtil::RangeContentAsString(range).unwrapOr(
    160                  u"<Could not be converted to string>"_ns)));
    161 
    162      return range;
    163    }
    164  }
    165  TEXT_FRAGMENT_LOG("Extending to word boundaries collapsed the range.");
    166  return {nullptr};
    167 }
    168 
    169 Result<bool, ErrorResult>
    170 ExactMatchTextDirectiveCreator::CollectContextTerms() {
    171  if (MOZ_UNLIKELY(mRange->Collapsed())) {
    172    return false;
    173  }
    174  TEXT_FRAGMENT_LOG("Collecting context terms for the target range.");
    175  MOZ_TRY(CollectPrefixContextTerm());
    176  MOZ_TRY(CollectSuffixContextTerm());
    177  mStartContent = MOZ_TRY(TextDirectiveUtil::RangeContentAsString(mRange));
    178  TEXT_FRAGMENT_LOG("Start term:\n{}", NS_ConvertUTF16toUTF8(mStartContent));
    179  TEXT_FRAGMENT_LOG("No end term present (exact match).");
    180  return true;
    181 }
    182 
    183 Result<bool, ErrorResult>
    184 RangeBasedTextDirectiveCreator::CollectContextTerms() {
    185  if (MOZ_UNLIKELY(mRange->Collapsed())) {
    186    return false;
    187  }
    188  TEXT_FRAGMENT_LOG("Collecting context terms for the target range.");
    189  MOZ_TRY(CollectPrefixContextTerm());
    190  MOZ_TRY(CollectSuffixContextTerm());
    191  if (const Maybe<RangeBoundary> firstBlockBoundaryInRange =
    192          TextDirectiveUtil::FindBlockBoundaryInRange<TextScanDirection::Right>(
    193              *mRange)) {
    194    TEXT_FRAGMENT_LOG(
    195        "Target range contains a block boundary, collecting start and end "
    196        "terms by considering the closest block boundaries inside the range.");
    197    ErrorResult rv;
    198    RefPtr startRange =
    199        StaticRange::Create(mRange->StartRef(), *firstBlockBoundaryInRange, rv);
    200    if (MOZ_UNLIKELY(rv.Failed())) {
    201      return Err(std::move(rv));
    202    }
    203    MOZ_DIAGNOSTIC_ASSERT(!startRange->Collapsed());
    204    mStartContent =
    205        MOZ_TRY(TextDirectiveUtil::RangeContentAsString(startRange));
    206    if (MOZ_UNLIKELY(mStartContent.IsEmpty())) {
    207      TEXT_FRAGMENT_LOG("Somehow got empty start term. Aborting.");
    208      return false;
    209    }
    210    const Maybe<RangeBoundary> lastBlockBoundaryInRange =
    211        TextDirectiveUtil::FindBlockBoundaryInRange<TextScanDirection::Left>(
    212            *mRange);
    213    MOZ_DIAGNOSTIC_ASSERT(
    214        lastBlockBoundaryInRange.isSome(),
    215        "If the target range contains a block boundary looking left-to-right, "
    216        "it must also contain one looking right-to-left");
    217    RefPtr endRange =
    218        StaticRange::Create(*lastBlockBoundaryInRange, mRange->EndRef(), rv);
    219    if (MOZ_UNLIKELY(rv.Failed())) {
    220      return Err(std::move(rv));
    221    }
    222    MOZ_DIAGNOSTIC_ASSERT(!endRange->Collapsed());
    223    mEndContent = MOZ_TRY(TextDirectiveUtil::RangeContentAsString(endRange));
    224    if (MOZ_UNLIKELY(mEndContent.IsEmpty())) {
    225      TEXT_FRAGMENT_LOG("Somehow got empty end term. Aborting.");
    226      return false;
    227    }
    228  } else {
    229    TEXT_FRAGMENT_LOG(
    230        "Target range is too long, collecting start and end by dividing "
    231        "content in the middle.");
    232    mStartContent = MOZ_TRY(TextDirectiveUtil::RangeContentAsString(mRange));
    233    MOZ_DIAGNOSTIC_ASSERT(
    234        mStartContent.Length() >
    235        StaticPrefs::
    236            dom_text_fragments_create_text_fragment_exact_match_max_length());
    237    const auto [wordStart, wordEnd] =
    238        intl::WordBreaker::FindWord(mStartContent, mStartContent.Length() / 2);
    239    // This check is fine because the range content strings have compressed
    240    // whitespace, therefore first and last character cannot be whitespace.
    241    if (wordStart == 0 && wordEnd == mStartContent.Length()) {
    242      TEXT_FRAGMENT_LOG(
    243          "Target range only contains one word, which is longer than the "
    244          "maximum length. Aborting.");
    245      return false;
    246    }
    247 
    248    // These cases are hit when `mStartContent` contains a very large (>50% of
    249    // the total length) word. Then the wordbreaker would return wordEnd=length
    250    // if the long word is at the end of the string. In that case, use the
    251    // wordStart position to break, so that mEndContent is not empty.
    252    if (wordEnd == mStartContent.Length()) {
    253      mEndContent = Substring(mStartContent, wordStart);
    254      mStartContent = Substring(mStartContent, 0, wordStart);
    255    } else {
    256      mEndContent = Substring(mStartContent, wordEnd);
    257      mStartContent = Substring(mStartContent, 0, wordEnd);
    258    }
    259  }
    260  if (mStartContent.Length() > kMaxContextTermLength) {
    261    TEXT_FRAGMENT_LOG(
    262        "Start term seems very long ({} chars), "
    263        "only considering the first {} chars.",
    264        mStartContent.Length(), kMaxContextTermLength);
    265    mStartContent = Substring(mStartContent, 0, kMaxContextTermLength);
    266  }
    267  mStartContent.CompressWhitespace();
    268  mStartFoldCaseContent = mStartContent;
    269  ToFoldedCase(mStartFoldCaseContent);
    270  TEXT_FRAGMENT_LOG("Maximum possible start term:\n{}",
    271                    NS_ConvertUTF16toUTF8(mStartContent));
    272  if (mEndContent.Length() > kMaxContextTermLength) {
    273    TEXT_FRAGMENT_LOG(
    274        "End term seems very long ({} chars), "
    275        "only considering the last {} chars.",
    276        mEndContent.Length(), kMaxContextTermLength);
    277    mEndContent =
    278        Substring(mEndContent, mEndContent.Length() - kMaxContextTermLength);
    279  }
    280  mEndContent.CompressWhitespace();
    281  mEndFoldCaseContent = mEndContent;
    282  ToFoldedCase(mEndFoldCaseContent);
    283  TEXT_FRAGMENT_LOG("Maximum possible end term:\n{}",
    284                    NS_ConvertUTF16toUTF8(mEndContent));
    285  return true;
    286 }
    287 
    288 Result<Ok, ErrorResult> TextDirectiveCreator::CollectPrefixContextTerm() {
    289  TEXT_FRAGMENT_LOG("Collecting prefix term for the target range.");
    290  ErrorResult rv;
    291  RangeBoundary prefixEnd =
    292      TextDirectiveUtil::FindNextNonWhitespacePosition<TextScanDirection::Left>(
    293          mRange->StartRef());
    294  RangeBoundary prefixStart =
    295      TextDirectiveUtil::FindNextBlockBoundary<TextScanDirection::Left>(
    296          prefixEnd);
    297  RefPtr prefixRange = StaticRange::Create(prefixStart, prefixEnd, rv);
    298  if (MOZ_UNLIKELY(rv.Failed())) {
    299    return Err(std::move(rv));
    300  }
    301  MOZ_ASSERT(prefixRange);
    302  mPrefixContent =
    303      MOZ_TRY(TextDirectiveUtil::RangeContentAsString(prefixRange));
    304  if (mPrefixContent.Length() > kMaxContextTermLength) {
    305    // cut off the prefix content at a word boundary near the max context term
    306    // length to make sure the term does not start with whitespace.
    307    auto [wordBegin, wordEnd] = intl::WordBreaker::FindWord(
    308        mPrefixContent, mPrefixContent.Length() - kMaxContextTermLength);
    309    while (nsContentUtils::IsHTMLWhitespace(mPrefixContent.CharAt(wordBegin))) {
    310      ++wordBegin;
    311    }
    312    TEXT_FRAGMENT_LOG(
    313        "Prefix term seems very long ({} chars), "
    314        "only considering the last {} chars.",
    315        mPrefixContent.Length(), wordBegin);
    316    mPrefixContent = Substring(mPrefixContent, wordBegin);
    317  }
    318  mPrefixFoldCaseContent = mPrefixContent;
    319  ToFoldedCase(mPrefixFoldCaseContent);
    320  TEXT_FRAGMENT_LOG("Maximum possible prefix term:\n{}",
    321                    NS_ConvertUTF16toUTF8(mPrefixContent));
    322  return Ok();
    323 }
    324 
    325 Result<Ok, ErrorResult> TextDirectiveCreator::CollectSuffixContextTerm() {
    326  TEXT_FRAGMENT_LOG("Collecting suffix term for the target range.");
    327  ErrorResult rv;
    328  RangeBoundary suffixBegin = TextDirectiveUtil::FindNextNonWhitespacePosition<
    329      TextScanDirection::Right>(mRange->EndRef());
    330  RangeBoundary suffixEnd =
    331      TextDirectiveUtil::FindNextBlockBoundary<TextScanDirection::Right>(
    332          suffixBegin);
    333  RefPtr suffixRange = StaticRange::Create(suffixBegin, suffixEnd, rv);
    334  if (MOZ_UNLIKELY(rv.Failed())) {
    335    return Err(std::move(rv));
    336  }
    337  MOZ_ASSERT(suffixRange);
    338  mSuffixContent =
    339      MOZ_TRY(TextDirectiveUtil::RangeContentAsString(suffixRange));
    340  if (mSuffixContent.Length() > kMaxContextTermLength) {
    341    // cut off the suffix content at a word boundary near the max context term
    342    // length to make sure the term does not end with whitespace.
    343    auto [wordBegin, wordEnd] =
    344        intl::WordBreaker::FindWord(mSuffixContent, kMaxContextTermLength);
    345    while (
    346        nsContentUtils::IsHTMLWhitespace(mSuffixContent.CharAt(wordEnd - 1))) {
    347      --wordEnd;
    348    }
    349    TEXT_FRAGMENT_LOG(
    350        "Suffix term seems very long ({} chars), "
    351        "only considering the first {} chars.",
    352        mSuffixContent.Length(), wordEnd + 1);
    353    mSuffixContent = Substring(mSuffixContent, 0, wordEnd + 1);
    354  }
    355  mSuffixFoldCaseContent = mSuffixContent;
    356  ToFoldedCase(mSuffixFoldCaseContent);
    357  TEXT_FRAGMENT_LOG("Maximum possible suffix term:\n{}",
    358                    NS_ConvertUTF16toUTF8(mSuffixContent));
    359  return Ok();
    360 }
    361 
    362 void ExactMatchTextDirectiveCreator::CollectContextTermWordBoundaryDistances() {
    363  mPrefixWordBeginDistances =
    364      TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Left>(
    365          mPrefixContent);
    366  TEXT_FRAGMENT_LOG("Word begin distances for prefix term: {}",
    367                    mPrefixWordBeginDistances);
    368  mSuffixWordEndDistances =
    369      TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>(
    370          mSuffixContent);
    371  TEXT_FRAGMENT_LOG("Word end distances for suffix term: {}",
    372                    mSuffixWordEndDistances);
    373 }
    374 
    375 void RangeBasedTextDirectiveCreator::CollectContextTermWordBoundaryDistances() {
    376  mPrefixWordBeginDistances =
    377      TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Left>(
    378          mPrefixContent);
    379  TEXT_FRAGMENT_LOG("Word begin distances for prefix term: {}",
    380                    mPrefixWordBeginDistances);
    381  MOZ_DIAGNOSTIC_ASSERT(!mStartContent.IsEmpty());
    382  mStartWordEndDistances =
    383      TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>(
    384          mStartContent);
    385  MOZ_DIAGNOSTIC_ASSERT(!mStartWordEndDistances.IsEmpty(),
    386                        "There must be at least one word in the start term.");
    387  MOZ_DIAGNOSTIC_ASSERT(mStartWordEndDistances[0] > 0);
    388  mFirstWordOfStartContent =
    389      Substring(mStartContent, 0, mStartWordEndDistances[0]);
    390  TEXT_FRAGMENT_LOG("First word of start term: {}",
    391                    NS_ConvertUTF16toUTF8(mFirstWordOfStartContent));
    392  if (mStartWordEndDistances[0] == mStartContent.Length()) {
    393    mStartFirstWordLengthIncludingWhitespace = mStartContent.Length();
    394    mStartWordEndDistances.Clear();
    395    TEXT_FRAGMENT_LOG("Start term cannot be extended.");
    396  } else {
    397    mStartFirstWordLengthIncludingWhitespace =
    398        TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
    399            TextScanDirection::Right>(mStartFoldCaseContent,
    400                                      mStartWordEndDistances);
    401    TEXT_FRAGMENT_LOG(
    402        "Word end distances for start term, starting at the beginning of the "
    403        "second word: {}",
    404        mStartWordEndDistances);
    405  }
    406 
    407  MOZ_DIAGNOSTIC_ASSERT(!mEndContent.IsEmpty());
    408  mEndWordBeginDistances =
    409      TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Left>(
    410          mEndContent);
    411  MOZ_DIAGNOSTIC_ASSERT(!mEndWordBeginDistances.IsEmpty(),
    412                        "There must be at least one word in the end term.");
    413  MOZ_DIAGNOSTIC_ASSERT(mEndWordBeginDistances[0] > 0);
    414  mLastWordOfEndContent =
    415      Substring(mEndContent, mEndContent.Length() - mEndWordBeginDistances[0]);
    416  TEXT_FRAGMENT_LOG("Last word of end term: {}",
    417                    NS_ConvertUTF16toUTF8(mLastWordOfEndContent));
    418  if (mEndWordBeginDistances[0] == mEndContent.Length()) {
    419    mEndLastWordLengthIncludingWhitespace = mEndContent.Length();
    420    mEndWordBeginDistances.Clear();
    421    TEXT_FRAGMENT_LOG("End term cannot be extended.");
    422  } else {
    423    mEndLastWordLengthIncludingWhitespace =
    424        TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
    425            TextScanDirection::Left>(mEndFoldCaseContent,
    426                                     mEndWordBeginDistances);
    427    TEXT_FRAGMENT_LOG(
    428        "Word begin distances for end term, starting at the end of the second "
    429        "last word: {}",
    430        mEndWordBeginDistances);
    431  }
    432 
    433  mSuffixWordEndDistances =
    434      TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>(
    435          mSuffixContent);
    436  TEXT_FRAGMENT_LOG("Word end distances for suffix term: {}",
    437                    mSuffixWordEndDistances);
    438 }
    439 
    440 Result<nsTArray<RefPtr<AbstractRange>>, ErrorResult>
    441 TextDirectiveCreator::FindAllMatchingRanges(const nsString& aSearchQuery,
    442                                            const RangeBoundary& aSearchStart,
    443                                            const RangeBoundary& aSearchEnd) {
    444  MOZ_ASSERT(!aSearchQuery.IsEmpty());
    445  RangeBoundary searchStart = aSearchStart;
    446  nsTArray<RefPtr<AbstractRange>> matchingRanges;
    447 
    448  while (true) {
    449    if (mWatchdog && mWatchdog->IsDone()) {
    450      return matchingRanges;
    451    }
    452    RefPtr<AbstractRange> searchResult = TextDirectiveUtil::FindStringInRange(
    453        mFinder, searchStart, aSearchEnd, aSearchQuery, true, true);
    454    if (!searchResult || searchResult->Collapsed()) {
    455      break;
    456    }
    457    searchStart = searchResult->StartRef();
    458    if (auto cmp = nsContentUtils::ComparePoints(searchStart, aSearchEnd,
    459                                                 &mNodeIndexCache);
    460        !cmp || *cmp != -1) {
    461      // this means hitting a bug in nsFind which apparently does not stop
    462      // exactly where it is told to. There are cases where it might
    463      // overshoot, e.g. if `aSearchEnd` is  a text node with offset=0.
    464      // However, due to reusing the cache used by nsFind this additional call
    465      // to ComparePoints should be very cheap.
    466      break;
    467    }
    468    matchingRanges.AppendElement(searchResult);
    469 
    470    MOZ_DIAGNOSTIC_ASSERT(searchResult->GetStartContainer()->IsText());
    471    auto newSearchStart =
    472        TextDirectiveUtil::MoveToNextBoundaryPoint(searchStart);
    473    MOZ_DIAGNOSTIC_ASSERT(newSearchStart != searchStart);
    474    searchStart = newSearchStart;
    475    if (auto cmp = nsContentUtils::ComparePoints(searchStart, aSearchEnd,
    476                                                 &mNodeIndexCache);
    477        !cmp || *cmp != -1) {
    478      break;
    479    }
    480  }
    481 
    482  TEXT_FRAGMENT_LOG(
    483      "Found {} matches for the input '{}' in the partial document.",
    484      matchingRanges.Length(), NS_ConvertUTF16toUTF8(aSearchQuery));
    485  return matchingRanges;
    486 }
    487 
    488 Result<Ok, ErrorResult>
    489 ExactMatchTextDirectiveCreator::FindAllMatchingCandidates() {
    490  if (MOZ_UNLIKELY(mRange->Collapsed())) {
    491    return Ok();
    492  }
    493 
    494  TEXT_FRAGMENT_LOG(
    495      "Searching all occurrences of range content ({}) in the partial document "
    496      "from document begin to begin of target range.",
    497      NS_ConvertUTF16toUTF8(mStartContent));
    498  const nsTArray<RefPtr<AbstractRange>> matchRanges =
    499      MOZ_TRY(FindAllMatchingRanges(mStartContent, {mDocument, 0u},
    500                                    mRange->StartRef()));
    501  FindCommonSubstringLengths(matchRanges);
    502  return Ok();
    503 }
    504 
    505 void ExactMatchTextDirectiveCreator::FindCommonSubstringLengths(
    506    const nsTArray<RefPtr<AbstractRange>>& aMatchRanges) {
    507  if (mWatchdog && mWatchdog->IsDone()) {
    508    return;
    509  }
    510  size_t loopCounter = 0;
    511  for (const auto& range : aMatchRanges) {
    512    TEXT_FRAGMENT_LOG("Computing common prefix substring length for match {}.",
    513                      ++loopCounter);
    514    const uint32_t commonPrefixLength =
    515        TextDirectiveUtil::ComputeCommonSubstringLength<
    516            TextScanDirection::Left>(
    517            mPrefixFoldCaseContent,
    518            TextDirectiveUtil::FindNextNonWhitespacePosition<
    519                TextScanDirection::Left>(range->StartRef()));
    520 
    521    TEXT_FRAGMENT_LOG("Computing common suffix substring length for match {}.",
    522                      loopCounter);
    523    const uint32_t commonSuffixLength =
    524        TextDirectiveUtil::ComputeCommonSubstringLength<
    525            TextScanDirection::Right>(
    526            mSuffixFoldCaseContent,
    527            TextDirectiveUtil::FindNextNonWhitespacePosition<
    528                TextScanDirection::Right>(range->EndRef()));
    529 
    530    mCommonSubstringLengths.EmplaceBack(commonPrefixLength, commonSuffixLength);
    531  }
    532 }
    533 
    534 Result<Ok, ErrorResult>
    535 RangeBasedTextDirectiveCreator::FindAllMatchingCandidates() {
    536  MOZ_DIAGNOSTIC_ASSERT(!mFirstWordOfStartContent.IsEmpty(),
    537                        "Minimal start content must not be empty.");
    538  MOZ_DIAGNOSTIC_ASSERT(!mLastWordOfEndContent.IsEmpty(),
    539                        "Minimal end content must not be empty.");
    540 
    541  TEXT_FRAGMENT_LOG(
    542      "Searching all occurrences of first word of start content ({}) in the "
    543      "partial document from document begin to begin of the target range.",
    544      NS_ConvertUTF16toUTF8(mFirstWordOfStartContent));
    545 
    546  const nsTArray<RefPtr<AbstractRange>> startContentRanges =
    547      MOZ_TRY(FindAllMatchingRanges(mFirstWordOfStartContent, {mDocument, 0u},
    548                                    mRange->StartRef()));
    549  FindStartMatchCommonSubstringLengths(startContentRanges);
    550 
    551  if (mWatchdog && mWatchdog->IsDone()) {
    552    return Ok();
    553  }
    554  TEXT_FRAGMENT_LOG(
    555      "Searching all occurrences of last word of end content ({}) in the "
    556      "partial document from beginning of the target range to the end of the "
    557      "target range, excluding the last word.",
    558      NS_ConvertUTF16toUTF8(mLastWordOfEndContent));
    559 
    560  auto searchEnd =
    561      TextDirectiveUtil::FindNextNonWhitespacePosition<TextScanDirection::Left>(
    562          mRange->EndRef());
    563  searchEnd =
    564      TextDirectiveUtil::FindWordBoundary<TextScanDirection::Left>(searchEnd);
    565 
    566  const nsTArray<RefPtr<AbstractRange>> endContentRanges =
    567      MOZ_TRY(FindAllMatchingRanges(mLastWordOfEndContent, mRange->StartRef(),
    568                                    searchEnd));
    569  FindEndMatchCommonSubstringLengths(endContentRanges);
    570  return Ok();
    571 }
    572 
    573 void RangeBasedTextDirectiveCreator::FindStartMatchCommonSubstringLengths(
    574    const nsTArray<RefPtr<AbstractRange>>& aMatchRanges) {
    575  size_t loopCounter = 0;
    576  for (const auto& range : aMatchRanges) {
    577    if (mWatchdog && mWatchdog->IsDone()) {
    578      return;
    579    }
    580    ++loopCounter;
    581    TEXT_FRAGMENT_LOG(
    582        "Computing common prefix substring length for start match {}.",
    583        loopCounter);
    584    const uint32_t commonPrefixLength =
    585        TextDirectiveUtil::ComputeCommonSubstringLength<
    586            TextScanDirection::Left>(
    587            mPrefixFoldCaseContent,
    588            TextDirectiveUtil::FindNextNonWhitespacePosition<
    589                TextScanDirection::Left>(range->StartRef()));
    590    TEXT_FRAGMENT_LOG("Common prefix length: {}", commonPrefixLength);
    591 
    592    TEXT_FRAGMENT_LOG(
    593        "Computing common start substring length for start match {}.",
    594        loopCounter);
    595    const uint32_t commonStartLength =
    596        TextDirectiveUtil::ComputeCommonSubstringLength<
    597            TextScanDirection::Right>(
    598            mStartFoldCaseContent,
    599            TextDirectiveUtil::FindNextNonWhitespacePosition<
    600                TextScanDirection::Right>(range->EndRef()));
    601 
    602    TEXT_FRAGMENT_LOG("Common length: {}", commonStartLength);
    603    mStartMatchCommonSubstringLengths.EmplaceBack(commonPrefixLength,
    604                                                  commonStartLength);
    605  }
    606 }
    607 
    608 void RangeBasedTextDirectiveCreator::FindEndMatchCommonSubstringLengths(
    609    const nsTArray<RefPtr<AbstractRange>>& aMatchRanges) {
    610  size_t loopCounter = 0;
    611  for (const auto& range : aMatchRanges) {
    612    if (mWatchdog && mWatchdog->IsDone()) {
    613      return;
    614    }
    615    ++loopCounter;
    616    TEXT_FRAGMENT_LOG("Computing common end substring length for end match {}.",
    617                      loopCounter);
    618    const uint32_t commonEndLength =
    619        TextDirectiveUtil::ComputeCommonSubstringLength<
    620            TextScanDirection::Left>(
    621            mEndFoldCaseContent,
    622            TextDirectiveUtil::FindNextNonWhitespacePosition<
    623                TextScanDirection::Left>(range->StartRef()));
    624    TEXT_FRAGMENT_LOG("Common end term length: {}", commonEndLength);
    625    TEXT_FRAGMENT_LOG(
    626        "Computing common suffix substring length for end match {}.",
    627        loopCounter);
    628    const uint32_t commonSuffixLength =
    629        TextDirectiveUtil::ComputeCommonSubstringLength<
    630            TextScanDirection::Right>(
    631            mSuffixFoldCaseContent,
    632            TextDirectiveUtil::FindNextNonWhitespacePosition<
    633                TextScanDirection::Right>(range->EndRef()));
    634    TEXT_FRAGMENT_LOG("Common suffix length: {}", commonSuffixLength);
    635 
    636    mEndMatchCommonSubstringLengths.EmplaceBack(commonEndLength,
    637                                                commonSuffixLength);
    638  }
    639 }
    640 
    641 Result<nsCString, ErrorResult> TextDirectiveCreator::CreateTextDirective() {
    642  if (mWatchdog && mWatchdog->IsDone()) {
    643    TEXT_FRAGMENT_LOG("Hitting timeout.");
    644    return VoidCString();
    645  }
    646  if (mRange->Collapsed()) {
    647    TEXT_FRAGMENT_LOG("Input range collapsed.");
    648    return VoidCString();
    649  }
    650  if (mStartContent.IsEmpty()) {
    651    TEXT_FRAGMENT_LOG("Input range is empty.");
    652    return VoidCString();
    653  }
    654 
    655  if (const Maybe<TextDirective> textDirective = FindShortestCombination()) {
    656    nsCString textDirectiveString;
    657    DebugOnly<bool> ret =
    658        create_text_directive(&*textDirective, &textDirectiveString);
    659    MOZ_ASSERT(ret);
    660    TEXT_FRAGMENT_LOG("Created text directive: {}", textDirectiveString);
    661    return textDirectiveString;
    662  }
    663  TEXT_FRAGMENT_LOG(
    664      "It's not possible to create a text directive for the given range.");
    665  return nsCString{};
    666 }
    667 
    668 /*static*/ std::tuple<nsTArray<uint32_t>, nsTArray<uint32_t>>
    669 TextDirectiveCreator::ExtendSubstringLengthsToWordBoundaries(
    670    const nsTArray<std::tuple<uint32_t, uint32_t>>& aExactSubstringLengths,
    671    const Span<const uint32_t>& aFirstWordPositions,
    672    const Span<const uint32_t>& aSecondWordPositions) {
    673  const auto getNextWordBoundaryPosition =
    674      [](const Span<const uint32_t>& distances, uint32_t length) {
    675        // Note: This algorithm works for word begins and word ends,
    676        //       since the position arrays for properties that go right-to-left
    677        //       (prefix, end) are reversed and start from the end of the
    678        //       strings.
    679        for (const uint32_t distance : distances) {
    680          if (distance > length) {
    681            return distance;
    682          }
    683        }
    684        return distances.IsEmpty() ? 0 : distances.at(distances.Length() - 1);
    685      };
    686 
    687  const auto hashSetToSortedArray = [](const nsTHashSet<uint32_t>& aHashSet) {
    688    AutoTArray<uint32_t, 64> array;
    689    for (auto value : aHashSet) {
    690      array.InsertElementSorted(value);
    691    }
    692    return array;
    693  };
    694  nsTHashSet<uint32_t> firstSet;
    695  nsTHashSet<uint32_t> secondSet;
    696  firstSet.Insert(0);
    697  secondSet.Insert(0);
    698  // This loop is O(n^2) in the worst case, but the number of
    699  // aFirstWordPositions and aSecondWordPositions is small (< 32).
    700  // Also, one of the purposes of this algorithm is to bucket the exact lengths
    701  // (which represent the amount of matches for the target range) into word
    702  // bounded lengths. This means that the number of unique word bounded lengths
    703  // is < 32.
    704  for (const auto& [first, second] : aExactSubstringLengths) {
    705    firstSet.Insert(getNextWordBoundaryPosition(aFirstWordPositions, first));
    706    secondSet.Insert(getNextWordBoundaryPosition(aSecondWordPositions, second));
    707  }
    708  return {hashSetToSortedArray(firstSet), hashSetToSortedArray(secondSet)};
    709 }
    710 
    711 /*static*/
    712 Maybe<std::tuple<uint32_t, uint32_t>>
    713 TextDirectiveCreator::CheckAllCombinations(
    714    const nsTArray<std::tuple<uint32_t, uint32_t>>& aExactWordLengths,
    715    const nsTArray<uint32_t>& aFirstExtendedToWordBoundaries,
    716    const nsTArray<uint32_t>& aSecondExtendedToWordBoundaries) {
    717  nsTArray<std::tuple<uint32_t, uint32_t, uint32_t>> sortedCandidates;
    718  sortedCandidates.SetCapacity(aFirstExtendedToWordBoundaries.Length() *
    719                               aSecondExtendedToWordBoundaries.Length());
    720 
    721  // Create all combinations of the extended values and sort them by their
    722  // cost function value (sum of the two values).
    723  // The cost function value is used to sort the candidates, so that the
    724  // candidates with the lowest cost function value are checked first. Since the
    725  // algorithm searches for the shortest possible combination, it can return as
    726  // soon as it finds a valid combination.
    727  for (const uint32_t firstExtendedToWordBoundary :
    728       aFirstExtendedToWordBoundaries) {
    729    for (const uint32_t secondExtendedToWordBoundary :
    730         aSecondExtendedToWordBoundaries) {
    731      const uint32_t costFunctionValue =
    732          firstExtendedToWordBoundary + secondExtendedToWordBoundary;
    733      sortedCandidates.InsertElementSorted(
    734          std::tuple{firstExtendedToWordBoundary, secondExtendedToWordBoundary,
    735                     costFunctionValue},
    736          [](const auto& a, const auto& b) -> int {
    737            return std::get<2>(a) - std::get<2>(b);
    738          });
    739    }
    740  }
    741  for (auto [firstExtendedToWordBoundary, secondExtendedToWordBoundary,
    742             costFunctionValue] : sortedCandidates) {
    743    TEXT_FRAGMENT_LOG("Checking candidate ({},{}). Score: {}",
    744                      firstExtendedToWordBoundary, secondExtendedToWordBoundary,
    745                      costFunctionValue);
    746    const bool isInvalid = AnyOf(
    747        aExactWordLengths.begin(), aExactWordLengths.end(),
    748        [firstExtended = firstExtendedToWordBoundary,
    749         secondExtended = secondExtendedToWordBoundary](
    750            const std::tuple<uint32_t, uint32_t>& exactWordLengths) {
    751          const auto [firstExact, secondExact] = exactWordLengths;
    752          return firstExtended <= firstExact && secondExtended <= secondExact;
    753        });
    754    if (isInvalid) {
    755      TEXT_FRAGMENT_LOG(
    756          "Current candidate doesn't eliminate all matches. Discarding this "
    757          "candidate.");
    758      continue;
    759    }
    760    TEXT_FRAGMENT_LOG("Current candidate ({},{}) is the best candidate.",
    761                      firstExtendedToWordBoundary,
    762                      secondExtendedToWordBoundary);
    763    return Some(
    764        std::tuple{firstExtendedToWordBoundary, secondExtendedToWordBoundary});
    765  }
    766  return Nothing{};
    767 }
    768 
    769 Maybe<TextDirective> ExactMatchTextDirectiveCreator::FindShortestCombination()
    770    const {
    771  const auto [prefixLengths, suffixLengths] =
    772      TextDirectiveCreator::ExtendSubstringLengthsToWordBoundaries(
    773          mCommonSubstringLengths, mPrefixWordBeginDistances,
    774          mSuffixWordEndDistances);
    775  TEXT_FRAGMENT_LOG("Find shortest combination based on prefix and suffix.");
    776  TEXT_FRAGMENT_LOG("Matches to eliminate: {}, Total combinations: {}",
    777                    mCommonSubstringLengths.Length(),
    778                    prefixLengths.Length() * suffixLengths.Length());
    779  TEXT_FRAGMENT_LOG("Checking prefix lengths (extended to word boundaries): {}",
    780                    prefixLengths);
    781  TEXT_FRAGMENT_LOG("Checking suffix lengths (extended to word boundaries): {}",
    782                    suffixLengths);
    783  TEXT_FRAGMENT_LOG("Matches: {}", mCommonSubstringLengths);
    784  return CheckAllCombinations(mCommonSubstringLengths, prefixLengths,
    785                              suffixLengths)
    786      .andThen([&](std::tuple<uint32_t, uint32_t> bestMatch) {
    787        const auto [prefixLength, suffixLength] = bestMatch;
    788        TextDirective td;
    789        if (prefixLength) {
    790          td.prefix =
    791              Substring(mPrefixContent, mPrefixContent.Length() - prefixLength);
    792        }
    793        td.start = mStartContent;
    794        if (suffixLength) {
    795          td.suffix = Substring(mSuffixContent, 0, suffixLength);
    796        }
    797        return Some(td);
    798      });
    799 }
    800 
    801 Maybe<TextDirective> RangeBasedTextDirectiveCreator::FindShortestCombination()
    802    const {
    803  // For this algorithm, ignore the first word of the start term  and the last
    804  // word of the end term (which are required). This allows the optimization
    805  // algorithm to minimize to 0.
    806  auto [prefixLengths, startLengths] = ExtendSubstringLengthsToWordBoundaries(
    807      mStartMatchCommonSubstringLengths, mPrefixWordBeginDistances,
    808      mStartWordEndDistances);
    809 
    810  TEXT_FRAGMENT_LOG(
    811      "Find shortest combination for start match based on prefix and start");
    812  TEXT_FRAGMENT_LOG("Matches to eliminate: {}, Total combinations: {}",
    813                    mStartMatchCommonSubstringLengths.Length(),
    814                    prefixLengths.Length() * startLengths.Length());
    815  TEXT_FRAGMENT_LOG("Checking prefix lengths (extended to word boundaries): {}",
    816                    prefixLengths);
    817  TEXT_FRAGMENT_LOG("Checking start lengths (extended to word boundaries): {}",
    818                    startLengths);
    819  TEXT_FRAGMENT_LOG("Matches: {}", mStartMatchCommonSubstringLengths);
    820 
    821  const auto bestStartMatch = CheckAllCombinations(
    822      mStartMatchCommonSubstringLengths, prefixLengths, startLengths);
    823  if (MOZ_UNLIKELY(bestStartMatch.isNothing())) {
    824    TEXT_FRAGMENT_LOG(
    825        "Could not find unique start match. It's not possible to create a text "
    826        "directive for the target range.");
    827    return Nothing{};
    828  }
    829  auto [endLengths, suffixLengths] = ExtendSubstringLengthsToWordBoundaries(
    830      mEndMatchCommonSubstringLengths, mEndWordBeginDistances,
    831      mSuffixWordEndDistances);
    832 
    833  TEXT_FRAGMENT_LOG(
    834      "Find shortest combination for end match based on end and suffix");
    835  TEXT_FRAGMENT_LOG("Matches to eliminate: {}, Total combinations: {}",
    836                    mEndMatchCommonSubstringLengths.Length(),
    837                    endLengths.Length() * suffixLengths.Length());
    838  TEXT_FRAGMENT_LOG("Checking end lengths (extended to word boundaries): {}",
    839                    endLengths);
    840  TEXT_FRAGMENT_LOG("Checking suffix lengths (extended to word boundaries): {}",
    841                    suffixLengths);
    842  TEXT_FRAGMENT_LOG("Matches: {}", mEndMatchCommonSubstringLengths);
    843  const auto bestEndMatch = CheckAllCombinations(
    844      mEndMatchCommonSubstringLengths, endLengths, suffixLengths);
    845  if (MOZ_UNLIKELY(bestEndMatch.isNothing())) {
    846    TEXT_FRAGMENT_LOG(
    847        "Could not find unique end match. It's not possible to create a text "
    848        "directive for the target range.");
    849    return Nothing{};
    850  }
    851  const auto [prefixLength, startLength] = *bestStartMatch;
    852  const auto [endLength, suffixLength] = *bestEndMatch;
    853  TextDirective td;
    854  if (prefixLength) {
    855    td.prefix =
    856        Substring(mPrefixContent, mPrefixContent.Length() - prefixLength);
    857  }
    858 
    859  if (startLength) {
    860    const uint32_t startLengthIncludingFirstWord =
    861        mStartFirstWordLengthIncludingWhitespace + startLength;
    862    MOZ_DIAGNOSTIC_ASSERT(startLengthIncludingFirstWord <=
    863                          mStartContent.Length());
    864    td.start = Substring(mStartContent, 0, startLengthIncludingFirstWord);
    865  } else {
    866    td.start = mFirstWordOfStartContent;
    867  }
    868  if (endLength) {
    869    const uint32_t endLengthIncludingLastWord =
    870        mEndLastWordLengthIncludingWhitespace + endLength;
    871 
    872    MOZ_DIAGNOSTIC_ASSERT(endLengthIncludingLastWord <= mEndContent.Length());
    873    td.end = Substring(mEndContent,
    874                       mEndContent.Length() - endLengthIncludingLastWord);
    875  } else {
    876    td.end = mLastWordOfEndContent;
    877  }
    878 
    879  if (suffixLength) {
    880    td.suffix = Substring(mSuffixContent, 0, suffixLength);
    881  }
    882 
    883  return Some(td);
    884 }
    885 
    886 }  // namespace mozilla::dom
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE