tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

nsPlainTextSerializer.h (13647B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 
      3 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      4 /* This Source Code Form is subject to the terms of the Mozilla Public
      5 * License, v. 2.0. If a copy of the MPL was not distributed with this
      6 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      7 
      8 /*
      9 * nsIContentSerializer implementation that can be used with an
     10 * nsIDocumentEncoder to convert a DOM into plaintext in a nice way
     11 * (e.g. for copy/paste as plaintext).
     12 */
     13 
     14 #ifndef nsPlainTextSerializer_h__
     15 #define nsPlainTextSerializer_h__
     16 
     17 #include <stack>
     18 
     19 #include "mozilla/Maybe.h"
     20 #include "nsAtom.h"
     21 #include "nsCycleCollectionParticipant.h"
     22 #include "nsIContentSerializer.h"
     23 #include "nsIDocumentEncoder.h"
     24 #include "nsString.h"
     25 #include "nsTArray.h"
     26 
     27 class nsIContent;
     28 
     29 namespace mozilla::dom {
     30 class DocumentType;
     31 class Element;
     32 }  // namespace mozilla::dom
     33 
     34 class nsPlainTextSerializer final : public nsIContentSerializer {
         // Converts DOM content handed to it through the nsIContentSerializer
         // interface into plain text.
     35 public:
     36  nsPlainTextSerializer();
     37 
     38  NS_DECL_CYCLE_COLLECTING_ISUPPORTS
     39  NS_DECL_CYCLE_COLLECTION_CLASS(nsPlainTextSerializer)
     40 
     41  // nsIContentSerializer overrides.
     42  NS_IMETHOD Init(uint32_t flags, uint32_t aWrapColumn,
     43                  const mozilla::Encoding* aEncoding, bool aIsCopying,
     44                  bool aIsWholeDocument, bool* aNeedsPreformatScanning,
     45                  nsAString& aOutput) override;
     46 
     47  NS_IMETHOD AppendText(mozilla::dom::Text* aText, int32_t aStartOffset,
     48                        int32_t aEndOffset) override;
     49  NS_IMETHOD AppendCDATASection(mozilla::dom::Text* aCDATASection,
     50                                int32_t aStartOffset,
     51                                int32_t aEndOffset) override;
         // Processing instructions, comments and doctypes are not serialized: the
         // following three overrides do nothing and return NS_OK.
     52  NS_IMETHOD AppendProcessingInstruction(
     53      mozilla::dom::ProcessingInstruction* aPI, int32_t aStartOffset,
     54      int32_t aEndOffset) override {
     55    return NS_OK;
     56  }
     57  NS_IMETHOD AppendComment(mozilla::dom::Comment* aComment,
     58                           int32_t aStartOffset, int32_t aEndOffset) override {
     59    return NS_OK;
     60  }
     61  NS_IMETHOD AppendDoctype(mozilla::dom::DocumentType* aDoctype) override {
     62    return NS_OK;
     63  }
     64  NS_IMETHOD AppendElementStart(
     65      mozilla::dom::Element* aElement,
     66      mozilla::dom::Element* aOriginalElement) override;
     67  NS_IMETHOD AppendElementEnd(mozilla::dom::Element* aElement,
     68                              mozilla::dom::Element* aOriginalElement) override;
     69 
     70  NS_IMETHOD FlushAndFinish() override;
     71 
     72  NS_IMETHOD Finish() override;
     73 
     74  NS_IMETHOD GetOutputLength(uint32_t& aLength) const override;
     75 
     76  NS_IMETHOD AppendDocumentStart(mozilla::dom::Document* aDocument) override;
     77 
     78  NS_IMETHOD ScanElementForPreformat(mozilla::dom::Element* aElement) override;
     79  NS_IMETHOD ForgetElementForPreformat(
     80      mozilla::dom::Element* aElement) override;
     81 
     82  static void HardWrapString(nsAString& aString, uint32_t aWrapCols,
     83                             int32_t flags);
     84 
     85 private:
     86  ~nsPlainTextSerializer();
     87 
     88  nsresult GetAttributeValue(mozilla::dom::Element* aElement,
     89                             const nsAtom* aName, nsString& aValueRet) const;
     90  void AddToLine(const char16_t* aStringToAdd, int32_t aLength);
     91 
     92  void MaybeWrapAndOutputCompleteLines();
     93 
     94  void EndHardBreakLine();
         // Resets the line-related state: we are in whitespace again, no line
         // break is due and mFloatingLines is set to -1.
     95  void ResetStateAfterLine() {
     96    mInWhitespace = true;
     97    mLineBreakDue = false;
     98    mFloatingLines = -1;
     99  }
    100 
    101  void EnsureVerticalSpace(int32_t noOfRows);
    102 
    103  void ConvertToLinesAndOutput(const nsAString& aString);
    104 
    105  void Write(const nsAString& aString);
    106 
    107  // @return true, iff the element's whitespace and newline characters have to
    108  //         be preserved according to its style or because it's a `<pre>`
    109  //         element.
    110  bool IsElementPreformatted() const;
    111  bool IsInOL() const;
    112  bool IsInOlOrUl() const;
    113  bool IsCurrentNodeConverted(mozilla::dom::Element* aElement) const;
    114  bool MustSuppressLeaf() const;
    115 
    116  /**
    117   * Returns the local name of the element as an atom if the element is an
    118   * HTML element and the atom is a static atom. Otherwise, nullptr is returned.
    119   */
    120  static nsAtom* GetIdForContent(nsIContent* aContent);
    121  nsresult DoOpenContainer(mozilla::dom::Element* aElement, const nsAtom* aTag);
    122  void OpenContainerForOutputFormatted(mozilla::dom::Element* aElement,
    123                                       const nsAtom* aTag);
    124  nsresult DoCloseContainer(mozilla::dom::Element* aElement,
    125                            const nsAtom* aTag);
    126  void CloseContainerForOutputFormatted(mozilla::dom::Element* aElement,
    127                                        const nsAtom* aTag);
    128  nsresult DoAddLeaf(mozilla::dom::Element* aElement, const nsAtom* aTag);
    129 
    130  void DoAddText(const nsAString& aText);
    131  void DoAddLineBreak();
    132 
         // @return true, iff mHeadLevel is 0.
    133  inline bool DoOutput() const { return mHeadLevel == 0; }
    134 
         // @return true, iff aLine is non-empty and its first character is '>'.
    135  static inline bool IsQuotedLine(const nsAString& aLine) {
    136    return !aLine.IsEmpty() && aLine.First() == char16_t('>');
    137  }
    138 
    139  // Stack handling functions
    140  bool GetLastBool(const nsTArray<bool>& aStack);
    141  void SetLastBool(nsTArray<bool>& aStack, bool aValue);
    142  void PushBool(nsTArray<bool>& aStack, bool aValue);
    143  bool PopBool(nsTArray<bool>& aStack);
    144 
    145  bool IsIgnorableRubyAnnotation(const nsAtom* aTag) const;
    146 
    147  // @return true, iff the element's whitespace and newline characters have to
    148  //         be preserved according to its style or because it's a `<pre>`
    149  //         element.
    150  static bool IsElementPreformatted(mozilla::dom::Element* aElement);
    151 
    152  // https://drafts.csswg.org/css-display/#block-level
    153  static bool IsCssBlockLevelElement(mozilla::dom::Element* aElement);
    154 
    155 private:
    156  uint32_t mHeadLevel = 0;
    157 
    158  class Settings {
    159   public:
    160    enum class HeaderStrategy {
    161      kNoIndentation,
    162      kIndentIncreasedWithHeaderLevel,
    163      kNumberHeadingsAndIndentSlightly
    164    };
    165 
    166    // May adapt the flags.
    167    //
    168    // @param aFlags As defined in nsIDocumentEncoder.idl.
    169    void Init(int32_t aFlags, uint32_t aWrapColumn);
    170 
    171    // Pref: converter.html2txt.structs.
    172    bool GetStructs() const { return mStructs; }
    173 
    174    // Pref: converter.html2txt.header_strategy.
    175    HeaderStrategy GetHeaderStrategy() const { return mHeaderStrategy; }
    176 
    177    // @return As defined in nsIDocumentEncoder.idl.
    178    int32_t GetFlags() const { return mFlags; }
    179 
    180    // @param aFlag As defined in nsIDocumentEncoder.idl. May consist of
    181    // multiple bitwise or'd flags.
           // @return true, iff at least one of the bits in aFlag is set in mFlags.
    182    bool HasFlag(int32_t aFlag) const { return mFlags & aFlag; }
    183 
    184    // Whether the output should include ruby annotations.
    185    bool GetWithRubyAnnotation() const { return mWithRubyAnnotation; }
    186 
    187    uint32_t GetWrapColumn() const { return mWrapColumn; }
    188 
           // @return true, iff the wrap column is non-zero and at least one of
           //         OutputFormatted and OutputWrap is set.
    189    bool MayWrap() const {
    190      return GetWrapColumn() && HasFlag(nsIDocumentEncoder::OutputFormatted |
    191                                        nsIDocumentEncoder::OutputWrap);
    192    }
    193 
           // @return true, unless OutputDisallowLineBreaking is set.
    194    bool MayBreakLines() const {
    195      return !HasFlag(nsIDocumentEncoder::OutputDisallowLineBreaking);
    196    }
    197 
    198   private:
    199    // @param aPrefHeaderStrategy Pref: converter.html2txt.header_strategy.
    200    static HeaderStrategy Convert(int32_t aPrefHeaderStrategy);
    201 
    202    // Pref: converter.html2txt.structs.
    203    bool mStructs = true;
    204 
    205    // Pref: converter.html2txt.header_strategy.
    206    HeaderStrategy mHeaderStrategy =
    207        HeaderStrategy::kIndentIncreasedWithHeaderLevel;
    208 
    209    // Flags defined in nsIDocumentEncoder.idl.
    210    int32_t mFlags = 0;
    211 
    212    // Whether the output should include ruby annotations.
    213    bool mWithRubyAnnotation = false;
    214 
    215    // The wrap column is how many fixed-pitch narrow
    216    // (https://unicode.org/reports/tr11/) (e.g. Latin) characters
    217    // should be allowed on a line. There could be fewer chars if the chars
    218    // are wider than Latin chars, or more if the chars are narrower.
    219    uint32_t mWrapColumn = 0;
    220  };
    221 
    222  Settings mSettings;
    223 
    224  struct Indentation {
    225    // The number of space characters to be inserted including the length of
    226    // mHeader.
    227    int32_t mLength = 0;
    228 
    229    // The header that has to be written in the indent.
    230    // That could be, for instance, the bullet in a bulleted list.
    231    nsString mHeader;
    232  };
    233 
    234  class CurrentLine {
    235   public:
    236    void ResetContentAndIndentationHeader();
    237 
    238    // @param aFlags As defined in nsIDocumentEncoder.idl.
    239    void MaybeReplaceNbspsInContent(int32_t aFlags);
    240 
    241    void CreateQuotesAndIndent(nsAString& aResult) const;
    242 
    243    bool HasContentOrIndentationHeader() const {
    244      return !mContent.IsEmpty() || !mIndentation.mHeader.IsEmpty();
    245    }
    246 
    247    // @param aUseLineBreaker Presumably whether a line breaker is consulted to
           //        find the wrap position -- TODO confirm in the implementation.
    248    int32_t FindWrapIndexForContent(uint32_t aWrapColumn,
    249                                    bool aUseLineBreaker) const;
    250 
    251    // @return Combined width of cite quote level and indentation.
    252    uint32_t DeterminePrefixWidth() const {
    253      // XXX: Should calculate prefixwidth with GetUnicharStringWidth
             // A quoted line contributes mCiteQuoteLevel + 1 columns (presumably
             // the '>' characters plus one separating space -- confirm); a
             // space-stuffed line contributes one more column.
    254      return (mCiteQuoteLevel > 0 ? mCiteQuoteLevel + 1 : 0) +
    255             mIndentation.mLength + uint32_t(mSpaceStuffed);
    256    }
    257 
    258    Indentation mIndentation;
    259 
    260    // The number of '>' characters.
    261    int32_t mCiteQuoteLevel = 0;
    262 
    263    // Whether this line is getting space-stuffed, see
    264    // https://datatracker.ietf.org/doc/html/rfc2646#section-4.4
    265    bool mSpaceStuffed = false;
    266 
    267    // Excludes indentation and quotes.
    268    nsString mContent;
    269  };
    270 
    271  CurrentLine mCurrentLine;
    272 
    273  class OutputManager {
    274   public:
    275    /**
    276     *  @param aFlags As defined in nsIDocumentEncoder.idl.
    277     *  @param aOutput An empty string.
    278     */
    279    OutputManager(int32_t aFlags, nsAString& aOutput);
    280 
    281    enum class StripTrailingWhitespaces { kMaybe, kNo };
    282 
    283    void Append(const CurrentLine& aCurrentLine,
    284                StripTrailingWhitespaces aStripTrailingWhitespaces);
    285 
    286    /**
    287     * @param aString Last character is expected to not be a line break.
    288     */
    289    void Append(const nsAString& aString);
    290 
    291    void AppendLineBreak(bool aForceCRLF = false);
    292 
    293    /**
    294     * This empties the current line cache without adding a NEWLINE.
    295     * Should not be used if line wrapping is of importance since
    296     * this function destroys the cache information.
    297     *
    298     * It will also write indentation and quotes if we believe we are
    299     * at the start of the line.
    300     */
    301    void Flush(CurrentLine& aCurrentLine);
    302 
    303    bool IsAtFirstColumn() const { return mAtFirstColumn; }
    304 
    305    uint32_t GetOutputLength() const;
    306 
    307   private:
    308    // As defined in nsIDocumentEncoder.idl.
    309    const int32_t mFlags;
    310 
    311    nsAString& mOutput;
    312 
    313    bool mAtFirstColumn;
    314 
    315    nsString mLineBreak;
    316  };
    317 
    318  static void PerformWrapAndOutputCompleteLines(
    319      const Settings& aSettings, CurrentLine& aLine, OutputManager& aOutput,
    320      bool aUseLineBreaker, nsPlainTextSerializer* aSerializer);
    321  static void AppendLineToOutput(const Settings& aSettings, CurrentLine& aLine,
    322                                 OutputManager& aOutput);
    323 
    324  mozilla::Maybe<OutputManager> mOutputManager;
    325 
    326  // If we've just written out a cite blockquote, we need to remember it
    327  // so we don't duplicate spaces before a <pre wrap> (which mail uses to quote
    328  // old messages).
    329  bool mHasWrittenCiteBlockquote = false;
    330 
    331  int32_t mFloatingLines;  // To store the number of lazy line breaks
    332 
    333  // Treat quoted text as though it's preformatted -- don't wrap it.
    334  // Having it on a pref is a temporary measure, See bug 69638.
         // NOTE(review): the two comment lines above do not obviously describe
         // mSpanLevel and may be left over from a removed member -- confirm.
    335  int32_t mSpanLevel;
    336 
    337  int32_t mEmptyLines;  // Will be the number of empty lines before
    338                        // the current. 0 if we are starting a new
    339                        // line and -1 if we are in a line.
    340 
    341  bool mInWhitespace;
    342  bool mPreFormattedMail;  // we're dealing with special DOM
    343                           // used by Thunderbird code.
    344 
    345  // While handling a new tag, this variable should remind if any line break
    346  // is due because of a closing tag. Setting it to "TRUE" while closing the
    347  // tags. Hence opening tags are guaranteed to start with appropriate line
    348  // breaks.
    349  bool mLineBreakDue = false;
    350 
    351  bool mPreformattedBlockBoundary;
    352 
    353  int32_t mHeaderCounter[7]; /* For header-numbering:
    354                                Number of previous headers of
    355                                the same depth and in the same
    356                                section.
    357                                mHeaderCounter[1] for <h1> etc. */
    358 
    359  // For handling table rows
    360  AutoTArray<bool, 8> mHasWrittenCellsForRow;
    361 
    362  // Values obtained in OpenContainer that are (also) needed in CloseContainer
    363  AutoTArray<bool, 8> mIsInCiteBlockquote;
    364 
    365  // The tag stack: the stack of tags we're operating on, so we can nest.
    366  // The stack only ever points to static atoms, so they don't need to be
    367  // refcounted.
    368  const nsAtom** mTagStack;
    369  uint32_t mTagStackIndex;
    370 
    371  // The stack indicating whether the elements we've been operating on are
    372  // CSS preformatted elements, so that we can tell if the text inside them
    373  // should be formatted.
    374  std::stack<bool> mPreformatStack;
    375 
    376  // Content in the stack above this index should be ignored:
    377  uint32_t mIgnoreAboveIndex;
    378 
    379  // The stack for ordered lists
    380  AutoTArray<int32_t, 100> mOLStack;
    381 
    382  uint32_t mULCount;
    383 
    384  bool mUseLineBreaker = false;
    385 
    386  // Convenience constant. It would be nice to have it as a const static
    387  // variable, but that causes issues with OpenBSD and module unloading.
    388  const nsString kSpace;
    389 
    390  // mIgnoredChildNodeLevel is used to tell if current node is an ignorable
    391  // child node. The initial value of mIgnoredChildNodeLevel is 0. When
    392  // serializer enters those specific nodes, mIgnoredChildNodeLevel increases
    393  // and is greater than 0. Otherwise when serializer leaves those nodes,
    394  // mIgnoredChildNodeLevel decreases.
    395  uint32_t mIgnoredChildNodeLevel = 0;
    396 };
    397 
        // Free function returning a serializer through the out-parameter
        // aSerializer and a status code as nsresult.
        // NOTE(review): presumably creates a new nsPlainTextSerializer -- confirm
        // against the definition, which is not part of this header.
    398 nsresult NS_NewPlainTextSerializer(nsIContentSerializer** aSerializer);
    399 
    400 #endif