tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

TextDirectiveCreator.h (12574B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim:set ts=2 sw=2 sts=2 et cindent: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #ifndef DOM_TEXTDIRECTIVECREATOR_H_
      8 #define DOM_TEXTDIRECTIVECREATOR_H_
      9 
     10 #include <tuple>
     11 
     12 #include "RangeBoundary.h"
     13 #include "TextDirectiveUtil.h"
     14 #include "mozilla/RefPtr.h"
     15 #include "mozilla/Result.h"
     16 #include "mozilla/dom/fragmentdirectives_ffi_generated.h"
     17 #include "nsStringFwd.h"
     18 
     19 class nsRange;
     20 
     21 namespace mozilla {
     22 class ErrorResult;
     23 }
     24 
     25 namespace mozilla::dom {
     26 class Document;
     27 /**
     28 * @brief Helper class to create a text directive string from a given `Range`.
     29 *
     30 * The class provides a public static creator function which encapsulates all
     31 * necessary logic.
     32 * This class serves as a base class that defines the main algorithm, and is
     33 * subclassed twice for exact and range-based matching.
     34 */
     35 class TextDirectiveCreator {
     36 public:
     37  /**
     38   * @brief Static creator function. Takes a `Range` and creates a text
     39   *        directive string, if possible.
     40   *
     41   * @param aDocument   The document in which `aInputRange` lives.
     42   * @param aInputRange The input range. This range will not be modified.
     43   * @param aWatchdog   A watchdog to ensure the operation does not run
     44   *                    longer than the predefined timeout.
     45   *
     46   * @return Returns a percent-encoded text directive string on success, an
     47   *         empty string if it's not possible to create a text fragment for the
     48   *         given range, or an error code.
     49   */
     50  static Result<nsCString, ErrorResult> CreateTextDirectiveFromRange(
     51      Document* aDocument, AbstractRange* aInputRange,
     52      const TimeoutWatchdog* aWatchdog);
     53 
     54  virtual ~TextDirectiveCreator();
     55 
     56 protected:
     57  TextDirectiveCreator(Document* aDocument, AbstractRange* aRange,
     58                       const TimeoutWatchdog* aWatchdog);
     59 
     60  /**
     61   * @brief Ensures the boundary points of the range point to word boundaries.
     62   *
     63   * This function always returns a new range.
     64   */
     65  static Result<RefPtr<AbstractRange>, ErrorResult> ExtendRangeToWordBoundaries(
     66      AbstractRange* aRange);
     67 
     68  /**
     69   * @brief Determines whether exact or range-based matching should be used.
     70   *
     71   * This function searches for a block boundary in `aRange`, which requires
     72   * range-based matching. If there is no block boundary, but the range content
     73   * is longer than a threshold, range-based matching is used as well.
     74   * This threshold is defined by the pref
     75   * `dom.text_fragments.create_text_fragment.exact_match_max_length`.
     76   *
     77   */
     78  static Result<bool, ErrorResult> MustUseRangeBasedMatching(
     79      AbstractRange* aRange);
     80 
     81  /**
     82   * @brief Creates an instance either for exact or range-based matching.
     83   */
     84  static Result<UniquePtr<TextDirectiveCreator>, ErrorResult> CreateInstance(
     85      Document* aDocument, AbstractRange* aRange,
     86      const TimeoutWatchdog* aWatchdog);
     87 
     88  /**
     89   * @brief Collects text content surrounding the target range.
     90   *
     91   * The context terms are then stored both in normal and fold case form.
     92   *
     93   * Returns false if the algorithm cannot continue, for example if the text
     94   * directive must use range-based matching because of its length, but the
     95   * target range only consists of one word.
     96   */
     97  virtual Result<bool, ErrorResult> CollectContextTerms() = 0;
     98 
     99  /**
    100   * @brief Common helper which collects the prefix term of the target range.
    101   */
    102  Result<Ok, ErrorResult> CollectPrefixContextTerm();
    103 
    104  /**
    105   * @brief Common helper which collects the suffix term of the target range.
    106   */
    107  Result<Ok, ErrorResult> CollectSuffixContextTerm();
    108 
    109  /**
    110   * @brief Collect the word begin / word end distances for the context terms.
    111   *
    112   * For start (for range-based matching) and suffix terms, the search direction
    113   * is left-to-right. Therefore, the distances are based off the beginning of
    114   * the context terms and use the word end boundary.
    115   *
    116   * For prefix and end (for range-based matching), the search direction is
    117   * right-to-left. Therefore, the distances are based off the end of the
    118   * context terms and use the word start boundary.
    119   *
    120   * The distances are always sorted, so that the first entry points to the
    121   * nearest word boundary in search direction.
    122   */
    123  virtual void CollectContextTermWordBoundaryDistances() = 0;
    124 
    125  /**
    126   * @brief Searches the document for other occurrences of the target range and
    127   *        converts the results into a comparable format.
    128   *
    129   * This method searches the partial document from the beginning up to the
    130   * target range for occurrences of the target range content.
    131   * This needs to be done differently based on whether matching is exact or
    132   * range-based. For exact matching, the whole text content of the target range
    133   * is searched for. For range-based matching, two search runs are required:
    134   * One for the minimal `start` term (ie., the first word), which ends at the
    135   * beginning of the target range. And one for the minimal `end` term (ie., the
    136   * last word), which starts at the beginning of the target range and ends
    137   * _before_ its end.
    138   * The resulting lists of matching ranges do not exclude the target range.
    139   */
    140  virtual Result<Ok, ErrorResult> FindAllMatchingCandidates() = 0;
    141 
    142  /**
    143   * @brief Find all occurrences of `aSearchQuery` in the partial document.
    144   *
    145   * This method uses `nsFind` to perform a case-insensitive search for
    146   * `aSearchQuery` in the partial document from `aSearchStart` to `aSearchEnd`.
    147   *
    148   * @return List of `Range`s which have the case-insensitive-same content as
    149   *         `aSearchQuery`.
    150   */
    151  Result<nsTArray<RefPtr<AbstractRange>>, ErrorResult> FindAllMatchingRanges(
    152      const nsString& aSearchQuery, const RangeBoundary& aSearchStart,
    153      const RangeBoundary& aSearchEnd);
    154 
    155  /**
    156   * @brief Creates the shortest possible text directive.
    157   *
    158   * @return A percent-encoded string containing a text directive. Returns empty
    159   *         string in cases where it's not possible to create a text directive.
    160   */
    161  Result<nsCString, ErrorResult> CreateTextDirective();
    162 
    163  /**
    164   * @brief Creates unique substring length arrays which are extended to the
    165   *        nearest word boundary.
    166   */
    167  static std::tuple<nsTArray<uint32_t>, nsTArray<uint32_t>>
    168  ExtendSubstringLengthsToWordBoundaries(
    169      const nsTArray<std::tuple<uint32_t, uint32_t>>& aExactSubstringLengths,
    170      const Span<const uint32_t>& aFirstWordPositions,
    171      const Span<const uint32_t>& aSecondWordPositions);
    172 
    173  /**
    174   * @brief Test all combinations to identify the shortest text directive.
    175   */
    176  virtual Maybe<TextDirective> FindShortestCombination() const = 0;
    177 
    178  /**
    179   * @brief Perform a brute-force optimization run to find the shortest
    180   *        combination of a combination of two context terms.
    181   *
    182   * Each combination of the extended values is compared against all exact
    183   * values. It is only considered valid if at least one value is longer than
    184   * the exact lengths.
    185   *
    186   * @param aExactWordLengths               Array of tuples containing the exact
    187   *                                        common sub string lengths of this
    188   *                                        combination.
    189   * @param aFirstExtendedToWordBoundaries  All valid substring lengths for the
    190   *                                        first context term, extended to its
    191   *                                        next word boundary in reading
    192   *                                        direction.
    193   * @param aSecondExtendedToWordBoundaries All valid substring lengths for the
    194   *                                        second context term, extended to its
    195   *                                        next word boundary in reading
    196   *                                        direction.
    197   * @return A tuple of sub string lengths extended to word boundaries, which is
    198   *         the shortest allowed combination to eliminate all matches.
    199   *         Returns `Nothing` if it's not possible to eliminate all matches.
    200   */
    201  static Maybe<std::tuple<uint32_t, uint32_t>> CheckAllCombinations(
    202      const nsTArray<std::tuple<uint32_t, uint32_t>>& aExactWordLengths,
    203      const nsTArray<uint32_t>& aFirstExtendedToWordBoundaries,
    204      const nsTArray<uint32_t>& aSecondExtendedToWordBoundaries);
    205 
    206  // The maximum length of the context terms around the target range.
    207  // If a context term is longer, it will be truncated to this length.
    208  // If -- which seems highly unlikely -- there is another match for the target
    209  // range which happens to have a context term with the same content, it will
    210  // essentially be ignored:
    211  // The common length would be equal to this value, also the maximum common
    212  // length extended to the word boundary. Therefore, the algorithm could not
    213  // find a candidate which exceeds this length, therefore ignoring it
    214  // altogether.
    215  //
    216  // If -- even more unlikely -- this condition would happen for _every_ context
    217  // term, the algorithm would determine that it cannot create a text
    218  // directive for the target range because it would be ambiguous.
    219  static constexpr uint32_t kMaxContextTermLength = 1024;
    220 
    221  nsString mPrefixContent;
    222  nsString mPrefixFoldCaseContent;
    223  nsTArray<uint32_t> mPrefixWordBeginDistances;
    224 
    225  nsString mStartContent;
    226 
    227  nsString mSuffixContent;
    228  nsString mSuffixFoldCaseContent;
    229  nsTArray<uint32_t> mSuffixWordEndDistances;
    230 
    231  NotNull<RefPtr<Document>> mDocument;
    232  NotNull<RefPtr<AbstractRange>> mRange;
    233 
    234  NotNull<RefPtr<nsFind>> mFinder;
    235 
    236  /**
    237   * The watchdog ensures that the algorithm exits after a defined time
    238   * duration, to ensure that the main thread is not blocked for too long.
    239   *
    240   * The duration is defined by the pref
    241   * `dom.text_fragments.create_text_fragment.timeout`.
    242   */
    243  RefPtr<const TimeoutWatchdog> mWatchdog;
    244 
    245  nsContentUtils::NodeIndexCache mNodeIndexCache;
    246 };
    247 
    248 /**
    249 * @brief Creator class which creates a range-based text directive.
    250 *
    251 */
    252 class RangeBasedTextDirectiveCreator : public TextDirectiveCreator {
    253 private:
    254  using TextDirectiveCreator::TextDirectiveCreator;
    255 
    256  Result<bool, ErrorResult> CollectContextTerms() override;
    257 
    258  void CollectContextTermWordBoundaryDistances() override;
    259 
    260  Result<Ok, ErrorResult> FindAllMatchingCandidates() override;
    261 
    262  void FindStartMatchCommonSubstringLengths(
    263      const nsTArray<RefPtr<AbstractRange>>& aMatchRanges);
    264 
    265  void FindEndMatchCommonSubstringLengths(
    266      const nsTArray<RefPtr<AbstractRange>>& aMatchRanges);
    267 
    268  Maybe<TextDirective> FindShortestCombination() const override;
    269 
    270  nsString mEndContent;
    271  // The fold case contents for start and end terms don't include the first/last
    272  // word of the start and end terms, because they are only used for finding the
    273  // common lengths for other matches.
    274  nsString mStartFoldCaseContent;
    275  nsString mEndFoldCaseContent;
    276 
    277  // These values are only passed into nsFind, therefore fold case is not
    278  // required.
    279  nsString mFirstWordOfStartContent;
    280  nsString mLastWordOfEndContent;
    281 
    282  // The lengths of the first/last word of the start and end terms, including
    283  // whitespace to the next word.
    284  // Therefore, these values are equal to
    285  // `m[Start|End]Content.Length() - m[Start|End]FoldCaseContent.Length()`.
    286  uint32_t mStartFirstWordLengthIncludingWhitespace = 0;
    287  uint32_t mEndLastWordLengthIncludingWhitespace = 0;
    288 
    289  // The distances are bound to the Fold Case Content strings, which do not
    290  // include the first/last word of the start and end terms.
    291  nsTArray<uint32_t> mStartWordEndDistances;
    292  nsTArray<uint32_t> mEndWordBeginDistances;
    293 
    294  nsTArray<std::tuple<uint32_t, uint32_t>> mStartMatchCommonSubstringLengths;
    295  nsTArray<std::tuple<uint32_t, uint32_t>> mEndMatchCommonSubstringLengths;
    296 };
    297 
    298 /**
    299 * @brief Creator class which creates an exact match text directive.
    300 *
    301 */
    302 class ExactMatchTextDirectiveCreator : public TextDirectiveCreator {
    303 private:
    304  using TextDirectiveCreator::TextDirectiveCreator;
    305 
    306  Result<bool, ErrorResult> CollectContextTerms() override;
    307 
    308  void CollectContextTermWordBoundaryDistances() override;
    309 
    310  Result<Ok, ErrorResult> FindAllMatchingCandidates() override;
    311 
    312  void FindCommonSubstringLengths(
    313      const nsTArray<RefPtr<AbstractRange>>& aMatchRanges);
    314 
    315  Maybe<TextDirective> FindShortestCombination() const override;
    316 
    317  nsTArray<std::tuple<uint32_t, uint32_t>> mCommonSubstringLengths;
    318 };
    319 }  // namespace mozilla::dom
    320 #endif