tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mozInlineSpellWordUtil.cpp (41705B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this
      4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      5 
      6 #include "mozInlineSpellWordUtil.h"
      7 
      8 #include <algorithm>
      9 #include <utility>
     10 
     11 #include "mozilla/BinarySearch.h"
     12 #include "mozilla/EditorBase.h"
     13 #include "mozilla/HTMLEditor.h"
     14 #include "mozilla/Logging.h"
     15 #include "mozilla/dom/CharacterDataBuffer.h"
     16 #include "mozilla/dom/Element.h"
     17 
     18 #include "nsDebug.h"
     19 #include "nsAtom.h"
     20 #include "nsComponentManagerUtils.h"
     21 #include "nsUnicodeProperties.h"
     22 #include "nsServiceManagerUtils.h"
     23 #include "nsIContent.h"
     24 #include "nsRange.h"
     25 #include "nsContentUtils.h"
     26 #include "nsIFrame.h"
     27 
     28 using namespace mozilla;
     29 
     30 static LazyLogModule sInlineSpellWordUtilLog{"InlineSpellWordUtil"};
     31 
     32 // IsIgnorableCharacter
     33 //
     34 //    These characters are ones that we should ignore in input.
     35 
     36 inline bool IsIgnorableCharacter(char ch) {
     37  return (ch == static_cast<char>(0xAD));  // SOFT HYPHEN
     38 }
     39 
     40 inline bool IsIgnorableCharacter(char16_t ch) {
     41  return (ch == 0xAD ||   // SOFT HYPHEN
     42          ch == 0x1806);  // MONGOLIAN TODO SOFT HYPHEN
     43 }
     44 
     45 // IsConditionalPunctuation
     46 //
     47 //    Some characters (like apostrophes) require characters on each side to be
     48 //    part of a word, and are otherwise punctuation.
     49 
     50 inline bool IsConditionalPunctuation(char ch) {
     51  return (ch == '\'' ||                    // RIGHT SINGLE QUOTATION MARK
     52          ch == static_cast<char>(0xB7));  // MIDDLE DOT
     53 }
     54 
     55 inline bool IsConditionalPunctuation(char16_t ch) {
     56  return (ch == '\'' || ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
     57          ch == 0x00B7);                 // MIDDLE DOT
     58 }
     59 
     60 static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
     61  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
     62  return (ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' ||
     63          IsConditionalPunctuation(ch));
     64 }
     65 
     66 static bool IsAmbiguousDOMWordSeprator(char ch) {
     67  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
     68  return IsAmbiguousDOMWordSeprator(static_cast<char16_t>(ch));
     69 }
     70 
     71 // IsDOMWordSeparator
     72 //
     73 //    Determines if the given character should be considered as a DOM Word
     74 //    separator. Basically, this is whitespace, although it could also have
     75 //    certain punctuation that we know ALWAYS breaks words. This is important.
     76 //    For example, we can't have any punctuation that could appear in a URL
     77 //    or email address in this, because those need to always fit into a single
     78 //    DOM word.
     79 
     80 static bool IsDOMWordSeparator(char ch) {
     81  // simple spaces or no-break space
     82  return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' ||
     83          ch == static_cast<char>(0xA0));
     84 }
     85 
     86 static bool IsDOMWordSeparator(char16_t ch) {
     87  // simple spaces
     88  if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') return true;
     89 
     90  // complex spaces - check only if char isn't ASCII (uncommon)
     91  if (ch >= 0xA0 && (ch == 0x00A0 ||  // NO-BREAK SPACE
     92                     ch == 0x2002 ||  // EN SPACE
     93                     ch == 0x2003 ||  // EM SPACE
     94                     ch == 0x2009 ||  // THIN SPACE
     95                     ch == 0x3000))   // IDEOGRAPHIC SPACE
     96    return true;
     97 
     98  // otherwise not a space
     99  return false;
    100 }
    101 
    102 bool NodeOffset::operator==(
    103    const mozilla::RangeBoundary& aRangeBoundary) const {
    104  if (aRangeBoundary.GetContainer() != mNode) {
    105    return false;
    106  }
    107 
    108  const Maybe<uint32_t> rangeBoundaryOffset =
    109      aRangeBoundary.Offset(RangeBoundary::OffsetFilter::kValidOffsets);
    110 
    111  MOZ_ASSERT(mOffset >= 0);
    112  return rangeBoundaryOffset &&
    113         (*rangeBoundaryOffset == static_cast<uint32_t>(mOffset));
    114 }
    115 
    116 bool NodeOffsetRange::operator==(const nsRange& aRange) const {
    117  return mBegin == aRange.StartRef() && mEnd == aRange.EndRef();
    118 }
    119 
    120 // static
    121 Maybe<mozInlineSpellWordUtil> mozInlineSpellWordUtil::Create(
    122    const EditorBase& aEditorBase) {
    123  dom::Document* document = aEditorBase.GetDocument();
    124  if (NS_WARN_IF(!document)) {
    125    return Nothing();
    126  }
    127 
    128  const bool isContentEditableOrDesignMode = aEditorBase.IsHTMLEditor();
    129 
    130  // Find the root node for the editor. For contenteditable the mRootNode could
    131  // change to shadow root if the begin and end are inside the shadowDOM.
    132  nsINode* rootNode = aEditorBase.GetRoot();
    133  if (NS_WARN_IF(!rootNode)) {
    134    return Nothing();
    135  }
    136 
    137  mozInlineSpellWordUtil util{*document, isContentEditableOrDesignMode,
    138                              *rootNode};
    139  return Some(std::move(util));
    140 }
    141 
    142 static inline bool IsSpellCheckingTextNode(nsINode* aNode) {
    143  nsIContent* parent = aNode->GetParent();
    144  if (parent &&
    145      parent->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style))
    146    return false;
    147  return aNode->IsText();
    148 }
    149 
    150 typedef void (*OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
    151 
    152 // Find the next node in the DOM tree in preorder.
    153 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
    154 // why we can't just use GetNextNode here, sadly.
    155 static nsINode* FindNextNode(nsINode* aNode, const nsINode* aRoot,
    156                             OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) {
    157  MOZ_ASSERT(aNode, "Null starting node?");
    158 
    159  nsINode* next = aNode->GetFirstChild();
    160  if (next) return next;
    161 
    162  // Don't look at siblings or otherwise outside of aRoot
    163  if (aNode == aRoot) return nullptr;
    164 
    165  next = aNode->GetNextSibling();
    166  if (next) return next;
    167 
    168  // Go up
    169  for (;;) {
    170    if (aOnLeaveNode) {
    171      aOnLeaveNode(aNode, aClosure);
    172    }
    173 
    174    next = aNode->GetParent();
    175    if (next == aRoot || !next) return nullptr;
    176    aNode = next;
    177 
    178    next = aNode->GetNextSibling();
    179    if (next) return next;
    180  }
    181 }
    182 
    183 // aNode is not a text node. Find the first text node starting at aNode/aOffset
    184 // in a preorder DOM traversal.
    185 static nsINode* FindNextTextNode(nsINode* aNode, int32_t aOffset,
    186                                 const nsINode* aRoot) {
    187  MOZ_ASSERT(aNode, "Null starting node?");
    188  MOZ_ASSERT(!IsSpellCheckingTextNode(aNode),
    189             "FindNextTextNode should start with a non-text node");
    190 
    191  nsINode* checkNode;
    192  // Need to start at the aOffset'th child
    193  nsIContent* child = aNode->GetChildAt_Deprecated(aOffset);
    194 
    195  if (child) {
    196    checkNode = child;
    197  } else {
    198    // aOffset was beyond the end of the child list.
    199    // goto next node after the last descendant of aNode in
    200    // a preorder DOM traversal.
    201    checkNode = aNode->GetNextNonChildNode(aRoot);
    202  }
    203 
    204  while (checkNode && !IsSpellCheckingTextNode(checkNode)) {
    205    checkNode = checkNode->GetNextNode(aRoot);
    206  }
    207  return checkNode;
    208 }
    209 
    210 // mozInlineSpellWordUtil::SetPositionAndEnd
    211 //
    212 //    We have two ranges "hard" and "soft". The hard boundary is simply
    213 //    the scope of the root node. The soft boundary is that which is set
    214 //    by the caller of this class by calling this function. If this function is
    215 //    not called, the soft boundary is the same as the hard boundary.
    216 //
    217 //    When we reach the soft boundary (mSoftText.GetEnd()), we keep
    218 //    going until we reach the end of a word. This allows the caller to set the
    219 //    end of the range to anything, and we will always check whole multiples of
    220 //    words. When we reach the hard boundary we stop no matter what.
    221 //
    222 //    There is no beginning soft boundary. This is because we only go to the
    223 //    previous node once, when finding the previous word boundary in
    224 //    SetPosition(). You might think of the soft boundary as being this initial
    225 //    position.
    226 
    227 nsresult mozInlineSpellWordUtil::SetPositionAndEnd(nsINode* aPositionNode,
    228                                                   int32_t aPositionOffset,
    229                                                   nsINode* aEndNode,
    230                                                   int32_t aEndOffset) {
    231  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
    232          ("%s: pos=(%p, %i), end=(%p, %i)", __FUNCTION__, aPositionNode,
    233           aPositionOffset, aEndNode, aEndOffset));
    234 
    235  MOZ_ASSERT(aPositionNode, "Null begin node?");
    236  MOZ_ASSERT(aEndNode, "Null end node?");
    237 
    238  MOZ_ASSERT(mRootNode, "Not initialized");
    239 
    240  // Find a appropriate root if we are dealing with contenteditable nodes which
    241  // are in the shadow DOM.
    242  if (mIsContentEditableOrDesignMode) {
    243    nsINode* rootNode = aPositionNode->SubtreeRoot();
    244    if (rootNode != aEndNode->SubtreeRoot()) {
    245      return NS_ERROR_FAILURE;
    246    }
    247 
    248    if (mozilla::dom::ShadowRoot::FromNode(rootNode)) {
    249      mRootNode = rootNode;
    250    }
    251  }
    252 
    253  mSoftText.Invalidate();
    254 
    255  if (!IsSpellCheckingTextNode(aPositionNode)) {
    256    // Start at the start of the first text node after aNode/aOffset.
    257    aPositionNode = FindNextTextNode(aPositionNode, aPositionOffset, mRootNode);
    258    aPositionOffset = 0;
    259  }
    260  NodeOffset softBegin = NodeOffset(aPositionNode, aPositionOffset);
    261 
    262  if (!IsSpellCheckingTextNode(aEndNode)) {
    263    // End at the start of the first text node after aEndNode/aEndOffset.
    264    aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
    265    aEndOffset = 0;
    266  }
    267  NodeOffset softEnd = NodeOffset(aEndNode, aEndOffset);
    268 
    269  nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd));
    270  if (NS_FAILED(rv)) {
    271    return rv;
    272  }
    273 
    274  int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftText.GetBegin());
    275  if (textOffset < 0) {
    276    return NS_OK;
    277  }
    278 
    279  mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
    280  return NS_OK;
    281 }
    282 
    283 nsresult mozInlineSpellWordUtil::EnsureWords(NodeOffset aSoftBegin,
    284                                             NodeOffset aSoftEnd) {
    285  if (mSoftText.mIsValid) return NS_OK;
    286  mSoftText.AdjustBeginAndBuildText(std::move(aSoftBegin), std::move(aSoftEnd),
    287                                    mRootNode);
    288 
    289  mRealWords.Clear();
    290  Result<RealWords, nsresult> realWords = BuildRealWords();
    291  if (realWords.isErr()) {
    292    return realWords.unwrapErr();
    293  }
    294 
    295  mRealWords = realWords.unwrap();
    296  mSoftText.mIsValid = true;
    297  return NS_OK;
    298 }
    299 
    300 nsresult mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord,
    301                                                  nsRange** aRange) const {
    302  NodeOffset begin =
    303      MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
    304  NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
    305  return MakeRange(begin, end, aRange);
    306 }
    307 void mozInlineSpellWordUtil::MakeNodeOffsetRangeForWord(
    308    const RealWord& aWord, NodeOffsetRange* aNodeOffsetRange) {
    309  NodeOffset begin =
    310      MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
    311  NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
    312  *aNodeOffsetRange = NodeOffsetRange(begin, end);
    313 }
    314 
    315 // mozInlineSpellWordUtil::GetRangeForWord
    316 
    317 nsresult mozInlineSpellWordUtil::GetRangeForWord(nsINode* aWordNode,
    318                                                 int32_t aWordOffset,
    319                                                 nsRange** aRange) {
    320  // Set our soft end and start
    321  NodeOffset pt(aWordNode, aWordOffset);
    322 
    323  if (!mSoftText.mIsValid || pt != mSoftText.GetBegin() ||
    324      pt != mSoftText.GetEnd()) {
    325    mSoftText.Invalidate();
    326    NodeOffset softBegin = pt;
    327    NodeOffset softEnd = pt;
    328    nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd));
    329    if (NS_FAILED(rv)) {
    330      return rv;
    331    }
    332  }
    333 
    334  int32_t offset = MapDOMPositionToSoftTextOffset(pt);
    335  if (offset < 0) return MakeRange(pt, pt, aRange);
    336  int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);
    337  if (wordIndex < 0) return MakeRange(pt, pt, aRange);
    338  return MakeRangeForWord(mRealWords[wordIndex], aRange);
    339 }
    340 
    341 // This is to fix characters that the spellchecker may not like
    342 static void NormalizeWord(const nsAString& aInput, int32_t aPos, int32_t aLen,
    343                          nsAString& aOutput) {
    344  aOutput.Truncate();
    345  for (int32_t i = 0; i < aLen; i++) {
    346    char16_t ch = aInput.CharAt(i + aPos);
    347 
    348    // remove ignorable characters from the word
    349    if (IsIgnorableCharacter(ch)) continue;
    350 
    351    // the spellchecker doesn't handle curly apostrophes in all languages
    352    if (ch == 0x2019) {  // RIGHT SINGLE QUOTATION MARK
    353      ch = '\'';
    354    }
    355 
    356    aOutput.Append(ch);
    357  }
    358 }
    359 
    360 // mozInlineSpellWordUtil::GetNextWord
    361 //
    362 //    FIXME-optimization: we shouldn't have to generate a range every single
    363 //    time. It would be better if the inline spellchecker didn't require a
    364 //    range unless the word was misspelled. This may or may not be possible.
    365 
    366 bool mozInlineSpellWordUtil::GetNextWord(Word& aWord) {
    367  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
    368          ("%s: mNextWordIndex=%d", __FUNCTION__, mNextWordIndex));
    369 
    370  if (mNextWordIndex < 0 || mNextWordIndex >= int32_t(mRealWords.Length())) {
    371    mNextWordIndex = -1;
    372    aWord.mSkipChecking = true;
    373    return false;
    374  }
    375 
    376  const RealWord& realWord = mRealWords[mNextWordIndex];
    377  MakeNodeOffsetRangeForWord(realWord, &aWord.mNodeOffsetRange);
    378  ++mNextWordIndex;
    379  aWord.mSkipChecking = !realWord.mCheckableWord;
    380  ::NormalizeWord(mSoftText.GetValue(), realWord.mSoftTextOffset,
    381                  realWord.mLength, aWord.mText);
    382 
    383  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
    384          ("%s: returning: %s (skip=%d)", __FUNCTION__,
    385           NS_ConvertUTF16toUTF8(aWord.mText).get(), aWord.mSkipChecking));
    386 
    387  return true;
    388 }
    389 
    390 // mozInlineSpellWordUtil::MakeRange
    391 //
    392 //    Convenience function for creating a range over the current document.
    393 
    394 nsresult mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
    395                                           nsRange** aRange) const {
    396  NS_ENSURE_ARG_POINTER(aBegin.mNode);
    397  if (!mDocument) {
    398    return NS_ERROR_NOT_INITIALIZED;
    399  }
    400 
    401  ErrorResult error;
    402  RefPtr<nsRange> range = nsRange::Create(aBegin.mNode, aBegin.mOffset,
    403                                          aEnd.mNode, aEnd.mOffset, error);
    404  if (NS_WARN_IF(error.Failed())) {
    405    return error.StealNSResult();
    406  }
    407  MOZ_ASSERT(range);
    408  range.forget(aRange);
    409  return NS_OK;
    410 }
    411 
    412 // static
    413 already_AddRefed<nsRange> mozInlineSpellWordUtil::MakeRange(
    414    const NodeOffsetRange& aRange) {
    415  IgnoredErrorResult ignoredError;
    416  RefPtr<nsRange> range =
    417      nsRange::Create(aRange.Begin().Node(), aRange.Begin().Offset(),
    418                      aRange.End().Node(), aRange.End().Offset(), ignoredError);
    419  NS_WARNING_ASSERTION(!ignoredError.Failed(), "Creating a range failed");
    420  return range.forget();
    421 }
    422 
    423 /*********** Word Splitting ************/
    424 
    425 // classifies a given character in the DOM word
    426 enum CharClass {
    427  CHAR_CLASS_WORD,
    428  CHAR_CLASS_SEPARATOR,
    429  CHAR_CLASS_END_OF_INPUT
    430 };
    431 
    432 // Encapsulates DOM-word to real-word splitting
    433 template <class T>
    434 struct MOZ_STACK_CLASS WordSplitState {
    435  const T& mDOMWordText;
    436  int32_t mDOMWordOffset;
    437  CharClass mCurCharClass;
    438 
    439  explicit WordSplitState(const T& aString)
    440      : mDOMWordText(aString),
    441        mDOMWordOffset(0),
    442        mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
    443 
    444  CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
    445  void Advance();
    446  void AdvanceThroughSeparators();
    447  void AdvanceThroughWord();
    448 
    449  // Finds special words like email addresses and URLs that may start at the
    450  // current position, and returns their length, or 0 if not found. This allows
    451  // arbitrary word breaking rules to be used for these special entities, as
    452  // long as they can not contain whitespace.
    453  bool IsSpecialWord() const;
    454 
    455  // Similar to IsSpecialWord except that this takes a split word as
    456  // input. This checks for things that do not require special word-breaking
    457  // rules.
    458  bool ShouldSkipWord(int32_t aStart, int32_t aLength) const;
    459 
    460  // Finds the last sequence of DOM word separators before aBeforeOffset and
    461  // returns the offset to its first element.
    462  Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
    463      int32_t aBeforeOffset) const;
    464 
    465  char16_t GetUnicharAt(int32_t aIndex) const;
    466 };
    467 
    468 // WordSplitState::ClassifyCharacter
    469 template <class T>
    470 CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex,
    471                                               bool aRecurse) const {
    472  MOZ_ASSERT(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
    473             "Index out of range");
    474  if (aIndex == int32_t(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR;
    475 
    476  // this will classify the character, we want to treat "ignorable" characters
    477  // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
    478  nsUGenCategory charCategory =
    479      mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex));
    480  if (charCategory == nsUGenCategory::kLetter ||
    481      IsIgnorableCharacter(mDOMWordText[aIndex]) ||
    482      mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
    483      mDOMWordText[aIndex] == 0x200D /* ZWJ */)
    484    return CHAR_CLASS_WORD;
    485 
    486  // If conditional punctuation is surrounded immediately on both sides by word
    487  // characters it also counts as a word character.
    488  if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
    489    if (!aRecurse) {
    490      // not allowed to look around, this punctuation counts like a separator
    491      return CHAR_CLASS_SEPARATOR;
    492    }
    493 
    494    // check the left-hand character
    495    if (aIndex == 0) return CHAR_CLASS_SEPARATOR;
    496    if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
    497      return CHAR_CLASS_SEPARATOR;
    498    // If the previous charatcer is a word-char, make sure that it's not a
    499    // special dot character.
    500    if (mDOMWordText[aIndex - 1] == '.') return CHAR_CLASS_SEPARATOR;
    501 
    502    // now we know left char is a word-char, check the right-hand character
    503    if (aIndex == int32_t(mDOMWordText.Length() - 1)) {
    504      return CHAR_CLASS_SEPARATOR;
    505    }
    506 
    507    if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
    508      return CHAR_CLASS_SEPARATOR;
    509    // If the next charatcer is a word-char, make sure that it's not a
    510    // special dot character.
    511    if (mDOMWordText[aIndex + 1] == '.') return CHAR_CLASS_SEPARATOR;
    512 
    513    // char on either side is a word, this counts as a word
    514    return CHAR_CLASS_WORD;
    515  }
    516 
    517  // The dot character, if appearing at the end of a word, should
    518  // be considered part of that word.  Example: "etc.", or
    519  // abbreviations
    520  if (aIndex > 0 && mDOMWordText[aIndex] == '.' &&
    521      mDOMWordText[aIndex - 1] != '.' &&
    522      ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
    523    return CHAR_CLASS_WORD;
    524  }
    525 
    526  // all other punctuation
    527  if (charCategory == nsUGenCategory::kSeparator ||
    528      charCategory == nsUGenCategory::kOther ||
    529      charCategory == nsUGenCategory::kPunctuation ||
    530      charCategory == nsUGenCategory::kSymbol) {
    531    // Don't break on hyphens, as hunspell handles them on its own.
    532    if (aIndex > 0 && mDOMWordText[aIndex] == '-' &&
    533        mDOMWordText[aIndex - 1] != '-' &&
    534        ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
    535      // A hyphen is only meaningful as a separator inside a word
    536      // if the previous and next characters are a word character.
    537      if (aIndex == int32_t(mDOMWordText.Length()) - 1)
    538        return CHAR_CLASS_SEPARATOR;
    539      if (mDOMWordText[aIndex + 1] != '.' &&
    540          ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
    541        return CHAR_CLASS_WORD;
    542    }
    543    return CHAR_CLASS_SEPARATOR;
    544  }
    545 
    546  // any other character counts as a word
    547  return CHAR_CLASS_WORD;
    548 }
    549 
    550 // WordSplitState::Advance
    551 template <class T>
    552 void WordSplitState<T>::Advance() {
    553  MOZ_ASSERT(mDOMWordOffset >= 0, "Negative word index");
    554  MOZ_ASSERT(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
    555             "Length beyond end");
    556 
    557  mDOMWordOffset++;
    558  if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
    559    mCurCharClass = CHAR_CLASS_END_OF_INPUT;
    560  else
    561    mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
    562 }
    563 
    564 // WordSplitState::AdvanceThroughSeparators
    565 template <class T>
    566 void WordSplitState<T>::AdvanceThroughSeparators() {
    567  while (mCurCharClass == CHAR_CLASS_SEPARATOR) Advance();
    568 }
    569 
    570 // WordSplitState::AdvanceThroughWord
    571 template <class T>
    572 void WordSplitState<T>::AdvanceThroughWord() {
    573  while (mCurCharClass == CHAR_CLASS_WORD) Advance();
    574 }
    575 
    576 // WordSplitState::IsSpecialWord
    577 template <class T>
    578 bool WordSplitState<T>::IsSpecialWord() const {
    579  // Search for email addresses. We simply define these as any sequence of
    580  // characters with an '@' character in the middle. The DOM word is already
    581  // split on whitepace, so we know that everything to the end is the address
    582  int32_t firstColon = -1;
    583  for (int32_t i = mDOMWordOffset; i < int32_t(mDOMWordText.Length()); i++) {
    584    if (mDOMWordText[i] == '@') {
    585      // only accept this if there are unambiguous word characters (don't bother
    586      // recursing to disambiguate apostrophes) on each side. This prevents
    587      // classifying, e.g. "@home" as an email address
    588 
    589      // Use this condition to only accept words with '@' in the middle of
    590      // them. It works, but the inlinespellcker doesn't like this. The problem
    591      // is that you type "fhsgfh@" that's a misspelled word followed by a
    592      // symbol, but when you type another letter "fhsgfh@g" that first word
    593      // need to be unmarked misspelled. It doesn't do this. it only checks the
    594      // current position for potentially removing a spelling range.
    595      if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
    596          i < (int32_t)mDOMWordText.Length() - 1 &&
    597          ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
    598        return true;
    599      }
    600    } else if (mDOMWordText[i] == ':' && firstColon < 0) {
    601      firstColon = i;
    602 
    603      // If the first colon is followed by a slash, consider it a URL
    604      // This will catch things like asdf://foo.com
    605      if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
    606          mDOMWordText[firstColon + 1] == '/') {
    607        return true;
    608      }
    609    }
    610  }
    611 
    612  // Check the text before the first colon against some known protocols. It
    613  // is impossible to check against all protocols, especially since you can
    614  // plug in new protocols. We also don't want to waste time here checking
    615  // against a lot of obscure protocols.
    616  if (firstColon > mDOMWordOffset) {
    617    nsString protocol(
    618        Substring(mDOMWordText, mDOMWordOffset, firstColon - mDOMWordOffset));
    619    if (protocol.EqualsIgnoreCase("http") ||
    620        protocol.EqualsIgnoreCase("https") ||
    621        protocol.EqualsIgnoreCase("news") ||
    622        protocol.EqualsIgnoreCase("file") ||
    623        protocol.EqualsIgnoreCase("javascript") ||
    624        protocol.EqualsIgnoreCase("data") || protocol.EqualsIgnoreCase("ftp")) {
    625      return true;
    626    }
    627  }
    628 
    629  // not anything special
    630  return false;
    631 }
    632 
    633 // WordSplitState::ShouldSkipWord
    634 template <class T>
    635 bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const {
    636  int32_t last = aStart + aLength;
    637 
    638  // check to see if the word contains a digit
    639  for (int32_t i = aStart; i < last; i++) {
    640    if (mozilla::unicode::GetGenCategory(GetUnicharAt(i)) ==
    641        nsUGenCategory::kNumber) {
    642      return true;
    643    }
    644  }
    645 
    646  // not special
    647  return false;
    648 }
    649 
    650 template <class T>
    651 Maybe<int32_t> WordSplitState<T>::FindOffsetOfLastDOMWordSeparatorSequence(
    652    const int32_t aBeforeOffset) const {
    653  for (int32_t i = aBeforeOffset - 1; i >= 0; --i) {
    654    if (IsDOMWordSeparator(mDOMWordText[i]) ||
    655        (!IsAmbiguousDOMWordSeprator(mDOMWordText[i]) &&
    656         ClassifyCharacter(i, true) == CHAR_CLASS_SEPARATOR)) {
    657      // Be greedy, find as many separators as we can
    658      for (int32_t j = i - 1; j >= 0; --j) {
    659        if (IsDOMWordSeparator(mDOMWordText[j]) ||
    660            (!IsAmbiguousDOMWordSeprator(mDOMWordText[j]) &&
    661             ClassifyCharacter(j, true) == CHAR_CLASS_SEPARATOR)) {
    662          i = j;
    663        } else {
    664          break;
    665        }
    666      }
    667      return Some(i);
    668    }
    669  }
    670  return Nothing();
    671 }
    672 
    673 template <>
    674 char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt(
    675    int32_t aIndex) const {
    676  return mDOMWordText[aIndex];
    677 }
    678 
    679 template <>
    680 char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt(
    681    int32_t aIndex) const {
    682  return static_cast<char16_t>(static_cast<uint8_t>(mDOMWordText[aIndex]));
    683 }
    684 
    685 static inline bool IsBRElement(nsINode* aNode) {
    686  return aNode->IsHTMLElement(nsGkAtoms::br);
    687 }
    688 
    689 /**
    690 * Given a TextNode, finds the last sequence of DOM word separators before
    691 * aBeforeOffset and returns the offset to its first element.
    692 *
    693 * @param aContent the TextNode to check.
    694 * @param aBeforeOffset the offset in the TextNode before which we will search
    695 *        for the DOM separator. You can pass INT32_MAX to search the entire
    696 *        length of the string.
    697 */
    698 static Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
    699    nsIContent* aContent, int32_t aBeforeOffset) {
    700  const dom::CharacterDataBuffer* characterDataBuffer =
    701      aContent->GetCharacterDataBuffer();
    702  MOZ_ASSERT(characterDataBuffer, "Where is our text?");
    703  int32_t end =
    704      std::min(aBeforeOffset, int32_t(characterDataBuffer->GetLength()));
    705 
    706  if (characterDataBuffer->Is2b()) {
    707    nsDependentSubstring targetText(characterDataBuffer->Get2b(), end);
    708    WordSplitState<nsDependentSubstring> state(targetText);
    709    return state.FindOffsetOfLastDOMWordSeparatorSequence(end);
    710  }
    711 
    712  nsDependentCSubstring targetText(characterDataBuffer->Get1b(), end);
    713  WordSplitState<nsDependentCSubstring> state(targetText);
    714  return state.FindOffsetOfLastDOMWordSeparatorSequence(end);
    715 }
    716 
    717 /**
    718 * Check if there's a DOM word separator before aBeforeOffset in this node.
    719 * Always returns true if it's a BR element.
    720 * aSeparatorOffset is set to the index of the first character in the last
    721 * separator if any is found (0 for BR elements).
    722 *
    723 * This function does not modify aSeparatorOffset when it returns false.
    724 */
    725 static bool ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
    726                                     int32_t* aSeparatorOffset) {
    727  if (IsBRElement(aNode)) {
    728    *aSeparatorOffset = 0;
    729    return true;
    730  }
    731 
    732  if (!IsSpellCheckingTextNode(aNode)) return false;
    733 
    734  const Maybe<int32_t> separatorOffset =
    735      FindOffsetOfLastDOMWordSeparatorSequence(aNode->AsContent(),
    736                                               aBeforeOffset);
    737  if (separatorOffset) {
    738    *aSeparatorOffset = *separatorOffset;
    739    return true;
    740  }
    741 
    742  return false;
    743 }
    744 
    745 static bool IsBreakElement(nsINode* aNode) {
    746  if (!aNode->IsElement()) {
    747    return false;
    748  }
    749 
    750  dom::Element* element = aNode->AsElement();
    751  if (element->IsHTMLElement(nsGkAtoms::br)) {
    752    return true;
    753  }
    754 
    755  // If we don't have a frame, we don't consider ourselves a break
    756  // element.  In particular, words can span us.
    757  nsIFrame* frame = element->GetPrimaryFrame();
    758  if (!frame) {
    759    return false;
    760  }
    761 
    762  auto* disp = frame->StyleDisplay();
    763  // Anything that's not an inline element is a break element.
    764  // XXXbz should replaced inlines be break elements, though?
    765  // Also should inline-block and such be break elements?
    766  //
    767  // FIXME(emilio): We should teach the spell checker to deal with generated
    768  // content (it doesn't at all), then remove the IsListItem() check, as there
    769  // could be no marker, etc...
    770  return !disp->IsInlineFlow() || disp->IsListItem();
    771 }
    772 
    773 struct CheckLeavingBreakElementClosure {
    774  bool mLeftBreakElement;
    775 };
    776 
    777 static void CheckLeavingBreakElement(nsINode* aNode, void* aClosure) {
    778  CheckLeavingBreakElementClosure* cl =
    779      static_cast<CheckLeavingBreakElementClosure*>(aClosure);
    780  if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
    781    cl->mLeftBreakElement = true;
    782  }
    783 }
    784 
    785 void mozInlineSpellWordUtil::NormalizeWord(nsAString& aWord) {
    786  nsAutoString result;
    787  ::NormalizeWord(aWord, 0, aWord.Length(), result);
    788  aWord = result;
    789 }
    790 
    791 void mozInlineSpellWordUtil::SoftText::AdjustBeginAndBuildText(
    792    NodeOffset aBegin, NodeOffset aEnd, const nsINode* aRootNode) {
    793  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, ("%s", __FUNCTION__));
    794 
    795  mBegin = std::move(aBegin);
    796  mEnd = std::move(aEnd);
    797 
    798  // First we have to work backwards from mBegin to find a text node
    799  // containing a DOM word separator, a non-inline-element
    800  // boundary, or the hard start node. That's where we'll start building the
    801  // soft string from.
    802  nsINode* node = mBegin.mNode;
    803  int32_t firstOffsetInNode = 0;
    804  int32_t checkBeforeOffset = mBegin.mOffset;
    805  while (node) {
    806    if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
    807      if (node == mBegin.mNode) {
    808        // If we find a word separator on the first node, look at the preceding
    809        // word on the text node as well.
    810        if (firstOffsetInNode > 0) {
    811          // Try to find the previous word boundary in the current node. If
    812          // we can't find one, start checking previous sibling nodes (if any
    813          // adjacent ones exist) to see if we can find any text nodes with
    814          // DOM word separators. We bail out as soon as we see a node that is
    815          // not a text node, or we run out of previous sibling nodes. In the
    816          // event that we simply cannot find any preceding word separator, the
    817          // offset is set to 0, and the soft text beginning node is set to the
    818          // "most previous" text node before the original starting node, or
    819          // kept at the original starting node if no previous text nodes exist.
    820          int32_t newOffset = 0;
    821          if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
    822                                        &newOffset)) {
    823            nsIContent* prevNode = node->GetPreviousSibling();
    824            while (prevNode && IsSpellCheckingTextNode(prevNode)) {
    825              mBegin.mNode = prevNode;
    826              const Maybe<int32_t> separatorOffset =
    827                  FindOffsetOfLastDOMWordSeparatorSequence(prevNode, INT32_MAX);
    828              if (separatorOffset) {
    829                newOffset = *separatorOffset;
    830                break;
    831              }
    832              prevNode = prevNode->GetPreviousSibling();
    833            }
    834          }
    835          firstOffsetInNode = newOffset;
    836        } else {
    837          firstOffsetInNode = 0;
    838        }
    839 
    840        MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
    841                ("%s: adjusting mBegin.mOffset from %i to %i.", __FUNCTION__,
    842                 mBegin.mOffset, firstOffsetInNode));
    843        mBegin.mOffset = firstOffsetInNode;
    844      }
    845      break;
    846    }
    847    checkBeforeOffset = INT32_MAX;
    848    if (IsBreakElement(node)) {
    849      // Since GerPrevNode follows tree *preorder*, we're about to traverse up
    850      // out of 'node'. Since node induces breaks (e.g., it's a block), don't
    851      // bother trying to look outside it, just stop now.
    852      break;
    853    }
    854    // GetPreviousContent below expects aRootNode to be an ancestor of node.
    855    if (!node->IsInclusiveDescendantOf(aRootNode)) {
    856      break;
    857    }
    858    node = node->GetPrevNode(aRootNode);
    859  }
    860 
    861  // Now build up the string moving forward through the DOM until we reach
    862  // the soft end and *then* see a DOM word separator, a non-inline-element
    863  // boundary, or the hard end node.
    864  mValue.Truncate();
    865  mDOMMapping.Clear();
    866  bool seenSoftEnd = false;
    867  // Leave this outside the loop so large heap string allocations can be reused
    868  // across iterations
    869  while (node) {
    870    if (node == mEnd.mNode) {
    871      seenSoftEnd = true;
    872    }
    873 
    874    bool exit = false;
    875    if (IsSpellCheckingTextNode(node)) {
    876      nsIContent* content = static_cast<nsIContent*>(node);
    877      MOZ_ASSERT(content, "Where is our content?");
    878      const dom::CharacterDataBuffer* characterDataBuffer =
    879          content->GetCharacterDataBuffer();
    880      MOZ_ASSERT(characterDataBuffer, "Where is our text?");
    881      uint32_t lastOffsetInNode = characterDataBuffer->GetLength();
    882 
    883      if (seenSoftEnd) {
    884        // check whether we can stop after this
    885        for (uint32_t i =
    886                 node == mEnd.mNode ? AssertedCast<uint32_t>(mEnd.mOffset) : 0;
    887             i < characterDataBuffer->GetLength(); ++i) {
    888          if (IsDOMWordSeparator(characterDataBuffer->CharAt(i))) {
    889            exit = true;
    890            // stop at the first separator after the soft end point
    891            lastOffsetInNode = i;
    892            break;
    893          }
    894        }
    895      }
    896 
    897      if (firstOffsetInNode >= 0 &&
    898          static_cast<uint32_t>(firstOffsetInNode) < lastOffsetInNode) {
    899        const uint32_t len = lastOffsetInNode - firstOffsetInNode;
    900        mDOMMapping.AppendElement(DOMTextMapping(
    901            NodeOffset(node, firstOffsetInNode), mValue.Length(), len));
    902 
    903        const bool ok = characterDataBuffer->AppendTo(
    904            mValue, static_cast<uint32_t>(firstOffsetInNode), len,
    905            mozilla::fallible);
    906        if (!ok) {
    907          // probably out of memory, remove from mDOMMapping
    908          mDOMMapping.RemoveLastElement();
    909          exit = true;
    910        }
    911      }
    912 
    913      firstOffsetInNode = 0;
    914    }
    915 
    916    if (exit) break;
    917 
    918    CheckLeavingBreakElementClosure closure = {false};
    919    node = FindNextNode(node, aRootNode, CheckLeavingBreakElement, &closure);
    920    if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
    921      // We left, or are entering, a break element (e.g., block). Maybe we can
    922      // stop now.
    923      if (seenSoftEnd) break;
    924      // Record the break
    925      mValue.Append(' ');
    926    }
    927  }
    928 
    929  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
    930          ("%s: got DOM string: %s", __FUNCTION__,
    931           NS_ConvertUTF16toUTF8(mValue).get()));
    932 }
    933 
    934 auto mozInlineSpellWordUtil::BuildRealWords() const
    935    -> Result<RealWords, nsresult> {
    936  // This is pretty simple. We just have to walk mSoftText.GetValue(),
    937  // tokenizing it into "real words". We do an outer traversal of words
    938  // delimited by IsDOMWordSeparator, calling SplitDOMWordAndAppendTo on each of
    939  // those DOM words
    940  int32_t wordStart = -1;
    941  RealWords realWords;
    942  for (int32_t i = 0; i < int32_t(mSoftText.GetValue().Length()); ++i) {
    943    if (IsDOMWordSeparator(mSoftText.GetValue().CharAt(i))) {
    944      if (wordStart >= 0) {
    945        nsresult rv = SplitDOMWordAndAppendTo(wordStart, i, realWords);
    946        if (NS_FAILED(rv)) {
    947          return Err(rv);
    948        }
    949        wordStart = -1;
    950      }
    951    } else {
    952      if (wordStart < 0) {
    953        wordStart = i;
    954      }
    955    }
    956  }
    957  if (wordStart >= 0) {
    958    nsresult rv = SplitDOMWordAndAppendTo(
    959        wordStart, mSoftText.GetValue().Length(), realWords);
    960    if (NS_FAILED(rv)) {
    961      return Err(rv);
    962    }
    963  }
    964 
    965  return realWords;
    966 }
    967 
/****** DOM / real words <-> mSoftText.GetValue() mapping functions ******/
    970 
    971 int32_t mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(
    972    const NodeOffset& aNodeOffset) const {
    973  if (!mSoftText.mIsValid) {
    974    NS_ERROR("Soft text must be valid if we're to map into it");
    975    return -1;
    976  }
    977 
    978  for (int32_t i = 0; i < int32_t(mSoftText.GetDOMMapping().Length()); ++i) {
    979    const DOMTextMapping& map = mSoftText.GetDOMMapping()[i];
    980    if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
    981      // Allow offsets at either end of the string, in particular, allow the
    982      // offset that's at the end of the contributed string
    983      int32_t offsetInContributedString =
    984          aNodeOffset.mOffset - map.mNodeOffset.mOffset;
    985      if (offsetInContributedString >= 0 &&
    986          offsetInContributedString <= map.mLength)
    987        return map.mSoftTextOffset + offsetInContributedString;
    988      return -1;
    989    }
    990  }
    991  return -1;
    992 }
    993 
    994 namespace {
    995 
    996 template <class T>
    997 class FirstLargerOffset {
    998  int32_t mSoftTextOffset;
    999 
   1000 public:
   1001  explicit FirstLargerOffset(int32_t aSoftTextOffset)
   1002      : mSoftTextOffset(aSoftTextOffset) {}
   1003  int operator()(const T& t) const {
   1004    // We want the first larger offset, so never return 0 (which would
   1005    // short-circuit evaluation before finding the last such offset).
   1006    return mSoftTextOffset < t.mSoftTextOffset ? -1 : 1;
   1007  }
   1008 };
   1009 
   1010 template <class T>
   1011 bool FindLastNongreaterOffset(const nsTArray<T>& aContainer,
   1012                              int32_t aSoftTextOffset, size_t* aIndex) {
   1013  if (aContainer.Length() == 0) {
   1014    return false;
   1015  }
   1016 
   1017  BinarySearchIf(aContainer, 0, aContainer.Length(),
   1018                 FirstLargerOffset<T>(aSoftTextOffset), aIndex);
   1019  if (*aIndex > 0) {
   1020    // There was at least one mapping with offset <= aSoftTextOffset. Step back
   1021    // to find the last element with |mSoftTextOffset <= aSoftTextOffset|.
   1022    *aIndex -= 1;
   1023  } else {
   1024    // Every mapping had offset greater than aSoftTextOffset.
   1025    MOZ_ASSERT(aContainer[*aIndex].mSoftTextOffset > aSoftTextOffset);
   1026  }
   1027  return true;
   1028 }
   1029 
   1030 }  // namespace
   1031 
   1032 NodeOffset mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(
   1033    int32_t aSoftTextOffset, DOMMapHint aHint) const {
   1034  MOZ_ASSERT(mSoftText.mIsValid,
   1035             "Soft text must be valid if we're to map out of it");
   1036  if (!mSoftText.mIsValid) return NodeOffset(nullptr, -1);
   1037 
   1038  // Find the last mapping, if any, such that mSoftTextOffset <= aSoftTextOffset
   1039  size_t index;
   1040  bool found = FindLastNongreaterOffset(mSoftText.GetDOMMapping(),
   1041                                        aSoftTextOffset, &index);
   1042  if (!found) {
   1043    return NodeOffset(nullptr, -1);
   1044  }
   1045 
   1046  // 'index' is now the last mapping, if any, such that
   1047  // mSoftTextOffset <= aSoftTextOffset.
   1048  // If we're doing HINT_END, then we may want to return the end of the
   1049  // the previous mapping instead of the start of this mapping
   1050  if (aHint == HINT_END && index > 0) {
   1051    const DOMTextMapping& map = mSoftText.GetDOMMapping()[index - 1];
   1052    if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
   1053      return NodeOffset(map.mNodeOffset.mNode,
   1054                        map.mNodeOffset.mOffset + map.mLength);
   1055  }
   1056 
   1057  // We allow ourselves to return the end of this mapping even if we're
   1058  // doing HINT_START. This will only happen if there is no mapping which this
   1059  // point is the start of. I'm not 100% sure this is OK...
   1060  const DOMTextMapping& map = mSoftText.GetDOMMapping()[index];
   1061  int32_t offset = aSoftTextOffset - map.mSoftTextOffset;
   1062  if (offset >= 0 && offset <= map.mLength)
   1063    return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
   1064 
   1065  return NodeOffset(nullptr, -1);
   1066 }
   1067 
   1068 // static
   1069 void mozInlineSpellWordUtil::ToString(const DOMMapHint aHint,
   1070                                      nsACString& aResult) {
   1071  switch (aHint) {
   1072    case HINT_BEGIN:
   1073      aResult.AssignLiteral("begin");
   1074      break;
   1075    case HINT_END:
   1076      aResult.AssignLiteral("end");
   1077      break;
   1078  }
   1079 }
   1080 
   1081 int32_t mozInlineSpellWordUtil::FindRealWordContaining(
   1082    int32_t aSoftTextOffset, DOMMapHint aHint, bool aSearchForward) const {
   1083  if (MOZ_LOG_TEST(sInlineSpellWordUtilLog, LogLevel::Debug)) {
   1084    nsAutoCString hint;
   1085    mozInlineSpellWordUtil::ToString(aHint, hint);
   1086 
   1087    MOZ_LOG(
   1088        sInlineSpellWordUtilLog, LogLevel::Debug,
   1089        ("%s: offset=%i, hint=%s, searchForward=%i.", __FUNCTION__,
   1090         aSoftTextOffset, hint.get(), static_cast<int32_t>(aSearchForward)));
   1091  }
   1092 
   1093  MOZ_ASSERT(mSoftText.mIsValid,
   1094             "Soft text must be valid if we're to map out of it");
   1095  if (!mSoftText.mIsValid) return -1;
   1096 
   1097  // Find the last word, if any, such that mRealWords[index].mSoftTextOffset
   1098  // <= aSoftTextOffset
   1099  size_t index;
   1100  bool found = FindLastNongreaterOffset(mRealWords, aSoftTextOffset, &index);
   1101  if (!found) {
   1102    return -1;
   1103  }
   1104 
   1105  // 'index' is now the last word, if any, such that
   1106  // mSoftTextOffset <= aSoftTextOffset.
   1107  // If we're doing HINT_END, then we may want to return the end of the
   1108  // the previous word instead of the start of this word
   1109  if (aHint == HINT_END && index > 0) {
   1110    const RealWord& word = mRealWords[index - 1];
   1111    if (word.EndOffset() == aSoftTextOffset) {
   1112      return index - 1;
   1113    }
   1114  }
   1115 
   1116  // We allow ourselves to return the end of this word even if we're
   1117  // doing HINT_BEGIN. This will only happen if there is no word which this
   1118  // point is the start of. I'm not 100% sure this is OK...
   1119  const RealWord& word = mRealWords[index];
   1120  int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
   1121  if (offset >= 0 && offset <= static_cast<int32_t>(word.mLength)) return index;
   1122 
   1123  if (aSearchForward) {
   1124    if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
   1125      // All words have mSoftTextOffset > aSoftTextOffset
   1126      return 0;
   1127    }
   1128    // 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset.
   1129    // Word index+1, if it exists, will be the first with
   1130    // mSoftTextOffset > aSoftTextOffset.
   1131    if (index + 1 < mRealWords.Length()) return index + 1;
   1132  }
   1133 
   1134  return -1;
   1135 }
   1136 
   1137 // mozInlineSpellWordUtil::SplitDOMWordAndAppendTo
   1138 
   1139 nsresult mozInlineSpellWordUtil::SplitDOMWordAndAppendTo(
   1140    int32_t aStart, int32_t aEnd, nsTArray<RealWord>& aRealWords) const {
   1141  nsDependentSubstring targetText(mSoftText.GetValue(), aStart, aEnd - aStart);
   1142  WordSplitState<nsDependentSubstring> state(targetText);
   1143  state.mCurCharClass = state.ClassifyCharacter(0, true);
   1144 
   1145  state.AdvanceThroughSeparators();
   1146  if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT && state.IsSpecialWord()) {
   1147    int32_t specialWordLength =
   1148        state.mDOMWordText.Length() - state.mDOMWordOffset;
   1149    if (!aRealWords.AppendElement(
   1150            RealWord(aStart + state.mDOMWordOffset, specialWordLength, false),
   1151            fallible)) {
   1152      return NS_ERROR_OUT_OF_MEMORY;
   1153    }
   1154 
   1155    return NS_OK;
   1156  }
   1157 
   1158  while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
   1159    state.AdvanceThroughSeparators();
   1160    if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) break;
   1161 
   1162    // save the beginning of the word
   1163    int32_t wordOffset = state.mDOMWordOffset;
   1164 
   1165    // find the end of the word
   1166    state.AdvanceThroughWord();
   1167    int32_t wordLen = state.mDOMWordOffset - wordOffset;
   1168    if (!aRealWords.AppendElement(
   1169            RealWord(aStart + wordOffset, wordLen,
   1170                     !state.ShouldSkipWord(wordOffset, wordLen)),
   1171            fallible)) {
   1172      return NS_ERROR_OUT_OF_MEMORY;
   1173    }
   1174  }
   1175 
   1176  return NS_OK;
   1177 }