nsPlainTextSerializer.h (13647B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 3 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 4 /* This Source Code Form is subject to the terms of the Mozilla Public 5 * License, v. 2.0. If a copy of the MPL was not distributed with this 6 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 7 8 /* 9 * nsIContentSerializer implementation that can be used with an 10 * nsIDocumentEncoder to convert a DOM into plaintext in a nice way 11 * (eg for copy/paste as plaintext). 12 */ 13 14 #ifndef nsPlainTextSerializer_h__ 15 #define nsPlainTextSerializer_h__ 16 17 #include <stack> 18 19 #include "mozilla/Maybe.h" 20 #include "nsAtom.h" 21 #include "nsCycleCollectionParticipant.h" 22 #include "nsIContentSerializer.h" 23 #include "nsIDocumentEncoder.h" 24 #include "nsString.h" 25 #include "nsTArray.h" 26 27 class nsIContent; 28 29 namespace mozilla::dom { 30 class DocumentType; 31 class Element; 32 } // namespace mozilla::dom 33 34 class nsPlainTextSerializer final : public nsIContentSerializer { 35 public: 36 nsPlainTextSerializer(); 37 38 NS_DECL_CYCLE_COLLECTING_ISUPPORTS 39 NS_DECL_CYCLE_COLLECTION_CLASS(nsPlainTextSerializer) 40 41 // nsIContentSerializer 42 NS_IMETHOD Init(uint32_t flags, uint32_t aWrapColumn, 43 const mozilla::Encoding* aEncoding, bool aIsCopying, 44 bool aIsWholeDocument, bool* aNeedsPreformatScanning, 45 nsAString& aOutput) override; 46 47 NS_IMETHOD AppendText(mozilla::dom::Text* aText, int32_t aStartOffset, 48 int32_t aEndOffset) override; 49 NS_IMETHOD AppendCDATASection(mozilla::dom::Text* aCDATASection, 50 int32_t aStartOffset, 51 int32_t aEndOffset) override; 52 NS_IMETHOD AppendProcessingInstruction( 53 mozilla::dom::ProcessingInstruction* aPI, int32_t aStartOffset, 54 int32_t aEndOffset) override { 55 return NS_OK; 56 } 57 NS_IMETHOD AppendComment(mozilla::dom::Comment* aComment, 58 int32_t aStartOffset, int32_t aEndOffset) override { 59 return NS_OK; 60 } 61 NS_IMETHOD AppendDoctype(mozilla::dom::DocumentType* aDoctype) override { 62 return NS_OK; 63 } 64 NS_IMETHOD AppendElementStart( 65 mozilla::dom::Element* aElement, 66 mozilla::dom::Element* aOriginalElement) override; 67 NS_IMETHOD AppendElementEnd(mozilla::dom::Element* aElement, 68 mozilla::dom::Element* aOriginalElement) override; 69 70 NS_IMETHOD FlushAndFinish() override; 71 72 NS_IMETHOD Finish() override; 73 74 NS_IMETHOD GetOutputLength(uint32_t& aLength) const override; 75 76 NS_IMETHOD AppendDocumentStart(mozilla::dom::Document* aDocument) override; 77 78 NS_IMETHOD ScanElementForPreformat(mozilla::dom::Element* aElement) override; 79 NS_IMETHOD ForgetElementForPreformat( 80 mozilla::dom::Element* aElement) override; 81 82 static void HardWrapString(nsAString& aString, uint32_t aWrapCols, 83 int32_t flags); 84 85 private: 86 ~nsPlainTextSerializer(); 87 88 nsresult GetAttributeValue(mozilla::dom::Element* aElement, 89 const nsAtom* aName, nsString& aValueRet) const; 90 void AddToLine(const char16_t* aStringToAdd, int32_t aLength); 91 92 void MaybeWrapAndOutputCompleteLines(); 93 94 void EndHardBreakLine(); 95 void ResetStateAfterLine() { 96 mInWhitespace = true; 97 mLineBreakDue = false; 98 mFloatingLines = -1; 99 } 100 101 void EnsureVerticalSpace(int32_t noOfRows); 102 103 void ConvertToLinesAndOutput(const nsAString& aString); 104 105 void Write(const nsAString& aString); 106 107 // @return true, iff the elements' whitespace and newline characters have to 108 // be preserved according to its style or because it's a `<pre>` 109 // element. 110 bool IsElementPreformatted() const; 111 bool IsInOL() const; 112 bool IsInOlOrUl() const; 113 bool IsCurrentNodeConverted(mozilla::dom::Element* aElement) const; 114 bool MustSuppressLeaf() const; 115 116 /** 117 * Returns the local name of the element as an atom if the element is an 118 * HTML element and the atom is a static atom. Otherwise, nullptr is returned. 119 */ 120 static nsAtom* GetIdForContent(nsIContent* aContent); 121 nsresult DoOpenContainer(mozilla::dom::Element* aElement, const nsAtom* aTag); 122 void OpenContainerForOutputFormatted(mozilla::dom::Element* aElement, 123 const nsAtom* aTag); 124 nsresult DoCloseContainer(mozilla::dom::Element* aElement, 125 const nsAtom* aTag); 126 void CloseContainerForOutputFormatted(mozilla::dom::Element* aElement, 127 const nsAtom* aTag); 128 nsresult DoAddLeaf(mozilla::dom::Element* aElement, const nsAtom* aTag); 129 130 void DoAddText(const nsAString& aText); 131 void DoAddLineBreak(); 132 133 inline bool DoOutput() const { return mHeadLevel == 0; } 134 135 static inline bool IsQuotedLine(const nsAString& aLine) { 136 return !aLine.IsEmpty() && aLine.First() == char16_t('>'); 137 } 138 139 // Stack handling functions 140 bool GetLastBool(const nsTArray<bool>& aStack); 141 void SetLastBool(nsTArray<bool>& aStack, bool aValue); 142 void PushBool(nsTArray<bool>& aStack, bool aValue); 143 bool PopBool(nsTArray<bool>& aStack); 144 145 bool IsIgnorableRubyAnnotation(const nsAtom* aTag) const; 146 147 // @return true, iff the elements' whitespace and newline characters have to 148 // be preserved according to its style or because it's a `<pre>` 149 // element. 150 static bool IsElementPreformatted(mozilla::dom::Element* aElement); 151 152 // https://drafts.csswg.org/css-display/#block-level 153 static bool IsCssBlockLevelElement(mozilla::dom::Element* aElement); 154 155 private: 156 uint32_t mHeadLevel = 0; 157 158 class Settings { 159 public: 160 enum class HeaderStrategy { 161 kNoIndentation, 162 kIndentIncreasedWithHeaderLevel, 163 kNumberHeadingsAndIndentSlightly 164 }; 165 166 // May adapt the flags. 167 // 168 // @param aFlags As defined in nsIDocumentEncoder.idl. 169 void Init(int32_t aFlags, uint32_t aWrapColumn); 170 171 // Pref: converter.html2txt.structs. 172 bool GetStructs() const { return mStructs; } 173 174 // Pref: converter.html2txt.header_strategy. 175 HeaderStrategy GetHeaderStrategy() const { return mHeaderStrategy; } 176 177 // @return As defined in nsIDocumentEncoder.idl. 178 int32_t GetFlags() const { return mFlags; } 179 180 // @param aFlag As defined in nsIDocumentEncoder.idl. May consist of 181 // multiple bitwise or'd flags. 182 bool HasFlag(int32_t aFlag) const { return mFlags & aFlag; } 183 184 // Whether the output should include ruby annotations. 185 bool GetWithRubyAnnotation() const { return mWithRubyAnnotation; } 186 187 uint32_t GetWrapColumn() const { return mWrapColumn; } 188 189 bool MayWrap() const { 190 return GetWrapColumn() && HasFlag(nsIDocumentEncoder::OutputFormatted | 191 nsIDocumentEncoder::OutputWrap); 192 } 193 194 bool MayBreakLines() const { 195 return !HasFlag(nsIDocumentEncoder::OutputDisallowLineBreaking); 196 } 197 198 private: 199 // @param aPrefHeaderStrategy Pref: converter.html2txt.header_strategy. 200 static HeaderStrategy Convert(int32_t aPrefHeaderStrategy); 201 202 // Pref: converter.html2txt.structs. 203 bool mStructs = true; 204 205 // Pref: converter.html2txt.header_strategy. 206 HeaderStrategy mHeaderStrategy = 207 HeaderStrategy::kIndentIncreasedWithHeaderLevel; 208 209 // Flags defined in nsIDocumentEncoder.idl. 210 int32_t mFlags = 0; 211 212 // Whether the output should include ruby annotations. 213 bool mWithRubyAnnotation = false; 214 215 // The wrap column is how many fixed-pitch narrow 216 // (https://unicode.org/reports/tr11/) (e.g. Latin) characters 217 // should be allowed on a line. There could be less chars if the chars 218 // are wider than latin chars of more if the chars are more narrow. 219 uint32_t mWrapColumn = 0; 220 }; 221 222 Settings mSettings; 223 224 struct Indentation { 225 // The number of space characters to be inserted including the length of 226 // mHeader. 227 int32_t mLength = 0; 228 229 // The header that has to be written in the indent. 230 // That could be, for instance, the bullet in a bulleted list. 231 nsString mHeader; 232 }; 233 234 class CurrentLine { 235 public: 236 void ResetContentAndIndentationHeader(); 237 238 // @param aFlags As defined in nsIDocumentEncoder.idl. 239 void MaybeReplaceNbspsInContent(int32_t aFlags); 240 241 void CreateQuotesAndIndent(nsAString& aResult) const; 242 243 bool HasContentOrIndentationHeader() const { 244 return !mContent.IsEmpty() || !mIndentation.mHeader.IsEmpty(); 245 } 246 247 // @param aLineBreaker May be nullptr. 248 int32_t FindWrapIndexForContent(uint32_t aWrapColumn, 249 bool aUseLineBreaker) const; 250 251 // @return Combined width of cite quote level and indentation. 252 uint32_t DeterminePrefixWidth() const { 253 // XXX: Should calculate prefixwidth with GetUnicharStringWidth 254 return (mCiteQuoteLevel > 0 ? mCiteQuoteLevel + 1 : 0) + 255 mIndentation.mLength + uint32_t(mSpaceStuffed); 256 } 257 258 Indentation mIndentation; 259 260 // The number of '>' characters. 261 int32_t mCiteQuoteLevel = 0; 262 263 // Whether this line is getting space-stuffed, see 264 // https://datatracker.ietf.org/doc/html/rfc2646#section-4.4 265 bool mSpaceStuffed = false; 266 267 // Excludes indentation and quotes. 268 nsString mContent; 269 }; 270 271 CurrentLine mCurrentLine; 272 273 class OutputManager { 274 public: 275 /** 276 * @param aFlags As defined in nsIDocumentEncoder.idl. 277 * @param aOutput An empty string. 278 */ 279 OutputManager(int32_t aFlags, nsAString& aOutput); 280 281 enum class StripTrailingWhitespaces { kMaybe, kNo }; 282 283 void Append(const CurrentLine& aCurrentLine, 284 StripTrailingWhitespaces aStripTrailingWhitespaces); 285 286 /** 287 * @param aString Last character is expected to not be a line break. 288 */ 289 void Append(const nsAString& aString); 290 291 void AppendLineBreak(bool aForceCRLF = false); 292 293 /** 294 * This empties the current line cache without adding a NEWLINE. 295 * Should not be used if line wrapping is of importance since 296 * this function destroys the cache information. 297 * 298 * It will also write indentation and quotes if we believe us to be 299 * at the start of the line. 300 */ 301 void Flush(CurrentLine& aCurrentLine); 302 303 bool IsAtFirstColumn() const { return mAtFirstColumn; } 304 305 uint32_t GetOutputLength() const; 306 307 private: 308 // As defined in nsIDocumentEncoder.idl. 309 const int32_t mFlags; 310 311 nsAString& mOutput; 312 313 bool mAtFirstColumn; 314 315 nsString mLineBreak; 316 }; 317 318 static void PerformWrapAndOutputCompleteLines( 319 const Settings& aSettings, CurrentLine& aLine, OutputManager& aOutput, 320 bool aUseLineBreaker, nsPlainTextSerializer* aSerializer); 321 static void AppendLineToOutput(const Settings& aSettings, CurrentLine& aLine, 322 OutputManager& aOutput); 323 324 mozilla::Maybe<OutputManager> mOutputManager; 325 326 // If we've just written out a cite blockquote, we need to remember it 327 // so we don't duplicate spaces before a <pre wrap> (which mail uses to quote 328 // old messages). 329 bool mHasWrittenCiteBlockquote = false; 330 331 int32_t mFloatingLines; // To store the number of lazy line breaks 332 333 // Treat quoted text as though it's preformatted -- don't wrap it. 334 // Having it on a pref is a temporary measure, See bug 69638. 335 int32_t mSpanLevel; 336 337 int32_t mEmptyLines; // Will be the number of empty lines before 338 // the current. 0 if we are starting a new 339 // line and -1 if we are in a line. 340 341 bool mInWhitespace; 342 bool mPreFormattedMail; // we're dealing with special DOM 343 // used by Thunderbird code. 344 345 // While handling a new tag, this variable should remind if any line break 346 // is due because of a closing tag. Setting it to "TRUE" while closing the 347 // tags. Hence opening tags are guaranteed to start with appropriate line 348 // breaks. 349 bool mLineBreakDue = false; 350 351 bool mPreformattedBlockBoundary; 352 353 int32_t mHeaderCounter[7]; /* For header-numbering: 354 Number of previous headers of 355 the same depth and in the same 356 section. 357 mHeaderCounter[1] for <h1> etc. */ 358 359 // For handling table rows 360 AutoTArray<bool, 8> mHasWrittenCellsForRow; 361 362 // Values gotten in OpenContainer that is (also) needed in CloseContainer 363 AutoTArray<bool, 8> mIsInCiteBlockquote; 364 365 // The tag stack: the stack of tags we're operating on, so we can nest. 366 // The stack only ever points to static atoms, so they don't need to be 367 // refcounted. 368 const nsAtom** mTagStack; 369 uint32_t mTagStackIndex; 370 371 // The stack indicating whether the elements we've been operating on are 372 // CSS preformatted elements, so that we can tell if the text inside them 373 // should be formatted. 374 std::stack<bool> mPreformatStack; 375 376 // Content in the stack above this index should be ignored: 377 uint32_t mIgnoreAboveIndex; 378 379 // The stack for ordered lists 380 AutoTArray<int32_t, 100> mOLStack; 381 382 uint32_t mULCount; 383 384 bool mUseLineBreaker = false; 385 386 // Conveniance constant. It would be nice to have it as a const static 387 // variable, but that causes issues with OpenBSD and module unloading. 388 const nsString kSpace; 389 390 // mIgnoredChildNodeLevel is used to tell if current node is an ignorable 391 // child node. The initial value of mIgnoredChildNodeLevel is 0. When 392 // serializer enters those specific nodes, mIgnoredChildNodeLevel increases 393 // and is greater than 0. Otherwise when serializer leaves those nodes, 394 // mIgnoredChildNodeLevel decreases. 395 uint32_t mIgnoredChildNodeLevel = 0; 396 }; 397 398 nsresult NS_NewPlainTextSerializer(nsIContentSerializer** aSerializer); 399 400 #endif