nsHTMLContentSerializer.cpp (15334B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 /* 8 * nsIContentSerializer implementation that can be used with an 9 * nsIDocumentEncoder to convert an HTML (not XHTML!) DOM to an HTML 10 * string that could be parsed into more or less the original DOM. 11 */ 12 13 #include "nsHTMLContentSerializer.h" 14 15 #include "mozilla/dom/Document.h" 16 #include "mozilla/dom/Element.h" 17 #include "nsAttrName.h" 18 #include "nsCRT.h" 19 #include "nsContentUtils.h" 20 #include "nsElementTable.h" 21 #include "nsEscape.h" 22 #include "nsGkAtoms.h" 23 #include "nsIContent.h" 24 #include "nsIDocumentEncoder.h" 25 #include "nsIScriptElement.h" 26 #include "nsIURI.h" 27 #include "nsNameSpaceManager.h" 28 #include "nsNetUtil.h" 29 #include "nsParserConstants.h" 30 #include "nsString.h" 31 #include "nsUnicharUtils.h" 32 33 using namespace mozilla::dom; 34 35 nsresult NS_NewHTMLContentSerializer(nsIContentSerializer** aSerializer) { 36 RefPtr<nsHTMLContentSerializer> it = new nsHTMLContentSerializer(); 37 it.forget(aSerializer); 38 return NS_OK; 39 } 40 41 nsHTMLContentSerializer::nsHTMLContentSerializer() { mIsHTMLSerializer = true; } 42 43 nsHTMLContentSerializer::~nsHTMLContentSerializer() = default; 44 45 NS_IMETHODIMP 46 nsHTMLContentSerializer::AppendDocumentStart(Document* aDocument) { 47 return NS_OK; 48 } 49 50 bool nsHTMLContentSerializer::SerializeHTMLAttributes( 51 Element* aElement, Element* aOriginalElement, nsAString& aTagPrefix, 52 const nsAString& aTagNamespaceURI, nsAtom* aTagName, int32_t aNamespace, 53 nsAString& aStr) { 54 MaybeSerializeIsValue(aElement, aStr); 55 56 int32_t count = aElement->GetAttrCount(); 57 if (!count) return true; 58 59 nsresult rv; 60 nsAutoString valueStr; 61 62 for (int32_t index = 0; index < count; index++) { 63 const nsAttrName* name = aElement->GetAttrNameAt(index); 64 int32_t namespaceID = name->NamespaceID(); 65 nsAtom* attrName = name->LocalName(); 66 67 // Filter out any attribute starting with [-|_]moz 68 nsDependentAtomString attrNameStr(attrName); 69 if (StringBeginsWith(attrNameStr, u"_moz"_ns) || 70 StringBeginsWith(attrNameStr, u"-moz"_ns)) { 71 continue; 72 } 73 aElement->GetAttr(namespaceID, attrName, valueStr); 74 75 if (mIsCopying && mIsFirstChildOfOL && aTagName == nsGkAtoms::li && 76 aNamespace == kNameSpaceID_XHTML && attrName == nsGkAtoms::value && 77 namespaceID == kNameSpaceID_None) { 78 // This is handled separately in SerializeLIValueAttribute() 79 continue; 80 } 81 bool isJS = IsJavaScript(aElement, attrName, namespaceID, valueStr); 82 83 if (((attrName == nsGkAtoms::href && (namespaceID == kNameSpaceID_None || 84 namespaceID == kNameSpaceID_XLink)) || 85 (attrName == nsGkAtoms::src && namespaceID == kNameSpaceID_None))) { 86 // Make all links absolute when converting only the selection: 87 if (mFlags & nsIDocumentEncoder::OutputAbsoluteLinks) { 88 // Would be nice to handle OBJECT tags, but that gets more complicated 89 // since we have to search the tag list for CODEBASE as well. For now, 90 // just leave them relative. 91 nsIURI* uri = aElement->GetBaseURI(); 92 if (uri) { 93 nsAutoString absURI; 94 rv = NS_MakeAbsoluteURI(absURI, valueStr, uri); 95 if (NS_SUCCEEDED(rv)) { 96 valueStr = absURI; 97 } 98 } 99 } 100 } 101 102 if (mRewriteEncodingDeclaration && aTagName == nsGkAtoms::meta && 103 aNamespace == kNameSpaceID_XHTML && attrName == nsGkAtoms::content && 104 namespaceID == kNameSpaceID_None) { 105 // If we're serializing a <meta http-equiv="content-type">, 106 // use the proper value, rather than what's in the document. 107 nsAutoString header; 108 aElement->GetAttr(nsGkAtoms::httpEquiv, header); 109 if (header.LowerCaseEqualsLiteral("content-type")) { 110 valueStr = u"text/html; charset="_ns + NS_ConvertASCIItoUTF16(mCharset); 111 } 112 } 113 114 nsDependentAtomString nameStr(attrName); 115 nsAutoString prefix; 116 if (namespaceID == kNameSpaceID_XML) { 117 prefix.AssignLiteral(u"xml"); 118 } else if (namespaceID == kNameSpaceID_XLink) { 119 prefix.AssignLiteral(u"xlink"); 120 } 121 122 // Expand shorthand attribute. 123 if (aNamespace == kNameSpaceID_XHTML && namespaceID == kNameSpaceID_None && 124 IsShorthandAttr(attrName, aTagName) && valueStr.IsEmpty()) { 125 valueStr = nameStr; 126 } 127 NS_ENSURE_TRUE(SerializeAttr(prefix, nameStr, valueStr, aStr, !isJS), 128 false); 129 } 130 131 return true; 132 } 133 134 NS_IMETHODIMP 135 nsHTMLContentSerializer::AppendElementStart(Element* aElement, 136 Element* aOriginalElement) { 137 NS_ENSURE_ARG(aElement); 138 NS_ENSURE_STATE(mOutput); 139 140 bool forceFormat = false; 141 nsresult rv = NS_OK; 142 if (!CheckElementStart(aElement, forceFormat, *mOutput, rv)) { 143 // When we go to AppendElementEnd for this element, we're going to 144 // MaybeLeaveFromPreContent(). So make sure to MaybeEnterInPreContent() 145 // now, so our PreLevel() doesn't get confused. 146 MaybeEnterInPreContent(aElement); 147 return rv; 148 } 149 150 NS_ENSURE_SUCCESS(rv, rv); 151 152 nsAtom* name = aElement->NodeInfo()->NameAtom(); 153 int32_t ns = aElement->GetNameSpaceID(); 154 155 bool lineBreakBeforeOpen = LineBreakBeforeOpen(ns, name); 156 157 if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) { 158 if (mColPos && lineBreakBeforeOpen) { 159 NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); 160 } else { 161 NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput), 162 NS_ERROR_OUT_OF_MEMORY); 163 } 164 if (!mColPos) { 165 NS_ENSURE_TRUE(AppendIndentation(*mOutput), NS_ERROR_OUT_OF_MEMORY); 166 } else if (mAddSpace) { 167 bool result = AppendToString(char16_t(' '), *mOutput); 168 mAddSpace = false; 169 NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY); 170 } 171 } else if (mAddSpace) { 172 bool result = AppendToString(char16_t(' '), *mOutput); 173 mAddSpace = false; 174 NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY); 175 } else { 176 NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput), 177 NS_ERROR_OUT_OF_MEMORY); 178 } 179 // Always reset to avoid false newlines in case MaybeAddNewlineForRootNode 180 // wasn't called 181 mAddNewlineForRootNode = false; 182 183 NS_ENSURE_TRUE(AppendToString(kLessThan, *mOutput), NS_ERROR_OUT_OF_MEMORY); 184 185 NS_ENSURE_TRUE(AppendToString(nsDependentAtomString(name), *mOutput), 186 NS_ERROR_OUT_OF_MEMORY); 187 188 MaybeEnterInPreContent(aElement); 189 190 // for block elements, we increase the indentation 191 if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) 192 NS_ENSURE_TRUE(IncrIndentation(name), NS_ERROR_OUT_OF_MEMORY); 193 194 // Need to keep track of OL and LI elements in order to get ordinal number 195 // for the LI. 196 if (mIsCopying && name == nsGkAtoms::ol && ns == kNameSpaceID_XHTML) { 197 // We are copying and current node is an OL; 198 // Store its start attribute value in olState->startVal. 199 nsAutoString start; 200 int32_t startAttrVal = 0; 201 202 aElement->GetAttr(nsGkAtoms::start, start); 203 if (!start.IsEmpty()) { 204 nsresult rv = NS_OK; 205 startAttrVal = start.ToInteger(&rv); 206 // If OL has "start" attribute, first LI element has to start with that 207 // value Therefore subtracting 1 as all the LI elements are incrementing 208 // it before using it; In failure of ToInteger(), default StartAttrValue 209 // to 0. 210 if (NS_SUCCEEDED(rv)) 211 startAttrVal--; 212 else 213 startAttrVal = 0; 214 } 215 mOLStateStack.AppendElement(olState(startAttrVal, true)); 216 } 217 218 if (mIsCopying && name == nsGkAtoms::li && ns == kNameSpaceID_XHTML) { 219 mIsFirstChildOfOL = IsFirstChildOfOL(aOriginalElement); 220 if (mIsFirstChildOfOL) { 221 // If OL is parent of this LI, serialize attributes in different manner. 222 NS_ENSURE_TRUE(SerializeLIValueAttribute(aElement, *mOutput), 223 NS_ERROR_OUT_OF_MEMORY); 224 } 225 } 226 227 // Even LI passed above have to go through this 228 // for serializing attributes other than "value". 229 nsAutoString dummyPrefix; 230 NS_ENSURE_TRUE( 231 SerializeHTMLAttributes(aElement, aOriginalElement, dummyPrefix, u""_ns, 232 name, ns, *mOutput), 233 NS_ERROR_OUT_OF_MEMORY); 234 235 NS_ENSURE_TRUE(AppendToString(kGreaterThan, *mOutput), 236 NS_ERROR_OUT_OF_MEMORY); 237 238 if (ns == kNameSpaceID_XHTML && 239 (name == nsGkAtoms::script || name == nsGkAtoms::style || 240 (name == nsGkAtoms::noscript && 241 aElement->OwnerDoc()->IsScriptEnabled()) || 242 name == nsGkAtoms::noframes)) { 243 ++mDisableEntityEncoding; 244 } 245 246 if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel() && 247 LineBreakAfterOpen(ns, name)) { 248 NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); 249 } 250 251 NS_ENSURE_TRUE(AfterElementStart(aElement, aOriginalElement, *mOutput), 252 NS_ERROR_OUT_OF_MEMORY); 253 254 return NS_OK; 255 } 256 257 NS_IMETHODIMP 258 nsHTMLContentSerializer::AppendElementEnd(Element* aElement, 259 Element* aOriginalElement) { 260 NS_ENSURE_ARG(aElement); 261 NS_ENSURE_STATE(mOutput); 262 263 nsAtom* name = aElement->NodeInfo()->NameAtom(); 264 int32_t ns = aElement->GetNameSpaceID(); 265 266 if (ns == kNameSpaceID_XHTML && 267 (name == nsGkAtoms::script || name == nsGkAtoms::style || 268 (name == nsGkAtoms::noscript && 269 aElement->OwnerDoc()->IsScriptEnabled()) || 270 name == nsGkAtoms::noframes)) { 271 --mDisableEntityEncoding; 272 } 273 274 bool forceFormat = !(mFlags & nsIDocumentEncoder::OutputIgnoreMozDirty) && 275 aElement->HasAttr(nsGkAtoms::mozdirty); 276 277 if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) { 278 DecrIndentation(name); 279 } 280 281 if (name == nsGkAtoms::script) { 282 nsCOMPtr<nsIScriptElement> script = do_QueryInterface(aElement); 283 284 if (ShouldMaintainPreLevel() && script && script->IsMalformed()) { 285 // We're looking at a malformed script tag. This means that the end tag 286 // was missing in the source. Imitate that here by not serializing the end 287 // tag. 288 --PreLevel(); 289 return NS_OK; 290 } 291 } else if (mIsCopying && name == nsGkAtoms::ol && ns == kNameSpaceID_XHTML) { 292 NS_ASSERTION((!mOLStateStack.IsEmpty()), "Cannot have an empty OL Stack"); 293 /* Though at this point we must always have an state to be deleted as all 294 the OL opening tags are supposed to push an olState object to the stack*/ 295 if (!mOLStateStack.IsEmpty()) { 296 mOLStateStack.RemoveLastElement(); 297 } 298 } 299 300 if (ns == kNameSpaceID_XHTML) { 301 bool isContainer = 302 nsHTMLElement::IsContainer(nsHTMLTags::CaseSensitiveAtomTagToId(name)); 303 if (!isContainer) { 304 // Keep this in sync with the cleanup at the end of this method. 305 MOZ_ASSERT(name != nsGkAtoms::body); 306 MaybeLeaveFromPreContent(aElement); 307 return NS_OK; 308 } 309 } 310 311 if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) { 312 bool lineBreakBeforeClose = LineBreakBeforeClose(ns, name); 313 314 if (mColPos && lineBreakBeforeClose) { 315 NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); 316 } 317 if (!mColPos) { 318 NS_ENSURE_TRUE(AppendIndentation(*mOutput), NS_ERROR_OUT_OF_MEMORY); 319 } else if (mAddSpace) { 320 bool result = AppendToString(char16_t(' '), *mOutput); 321 mAddSpace = false; 322 NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY); 323 } 324 } else if (mAddSpace) { 325 bool result = AppendToString(char16_t(' '), *mOutput); 326 mAddSpace = false; 327 NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY); 328 } 329 330 NS_ENSURE_TRUE(AppendToString(kEndTag, *mOutput), NS_ERROR_OUT_OF_MEMORY); 331 NS_ENSURE_TRUE(AppendToString(nsDependentAtomString(name), *mOutput), 332 NS_ERROR_OUT_OF_MEMORY); 333 NS_ENSURE_TRUE(AppendToString(kGreaterThan, *mOutput), 334 NS_ERROR_OUT_OF_MEMORY); 335 336 // Keep this cleanup in sync with the IsContainer() early return above. 337 MaybeLeaveFromPreContent(aElement); 338 339 if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel() && 340 LineBreakAfterClose(ns, name)) { 341 NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY); 342 } else { 343 MaybeFlagNewlineForRootNode(aElement); 344 } 345 346 if (name == nsGkAtoms::body && ns == kNameSpaceID_XHTML) { 347 --mInBody; 348 } 349 350 return NS_OK; 351 } 352 353 static const uint16_t kValNBSP = 160; 354 355 #define _ 0 356 357 // This table indexes into kEntityStrings[]. 358 const uint8_t nsHTMLContentSerializer::kEntities[] = { 359 // clang-format off 360 _, _, _, _, _, _, _, _, _, _, 361 _, _, _, _, _, _, _, _, _, _, 362 _, _, _, _, _, _, _, _, _, _, 363 _, _, _, _, _, _, _, _, 2, _, 364 _, _, _, _, _, _, _, _, _, _, 365 _, _, _, _, _, _, _, _, _, _, 366 3, _, 4, _, _, _, _, _, _, _, 367 _, _, _, _, _, _, _, _, _, _, 368 _, _, _, _, _, _, _, _, _, _, 369 _, _, _, _, _, _, _, _, _, _, 370 _, _, _, _, _, _, _, _, _, _, 371 _, _, _, _, _, _, _, _, _, _, 372 _, _, _, _, _, _, _, _, _, _, 373 _, _, _, _, _, _, _, _, _, _, 374 _, _, _, _, _, _, _, _, _, _, 375 _, _, _, _, _, _, _, _, _, _, 376 5 377 // clang-format on 378 }; 379 380 // This table indexes into kEntityStrings[]. 381 const uint8_t nsHTMLContentSerializer::kAttrEntities[] = { 382 // clang-format off 383 _, _, _, _, _, _, _, _, _, _, 384 _, _, _, _, _, _, _, _, _, _, 385 _, _, _, _, _, _, _, _, _, _, 386 _, _, _, _, 1, _, _, _, 2, _, 387 _, _, _, _, _, _, _, _, _, _, 388 _, _, _, _, _, _, _, _, _, _, 389 3, _, 4, _, _, _, _, _, _, _, 390 _, _, _, _, _, _, _, _, _, _, 391 _, _, _, _, _, _, _, _, _, _, 392 _, _, _, _, _, _, _, _, _, _, 393 _, _, _, _, _, _, _, _, _, _, 394 _, _, _, _, _, _, _, _, _, _, 395 _, _, _, _, _, _, _, _, _, _, 396 _, _, _, _, _, _, _, _, _, _, 397 _, _, _, _, _, _, _, _, _, _, 398 _, _, _, _, _, _, _, _, _, _, 399 5 400 // clang-format on 401 }; 402 403 #undef _ 404 405 const char* const nsHTMLContentSerializer::kEntityStrings[] = { 406 /* 0 */ nullptr, 407 /* 1 */ """, 408 /* 2 */ "&", 409 /* 3 */ "<", 410 /* 4 */ ">", 411 /* 5 */ " "}; 412 413 bool nsHTMLContentSerializer::AppendAndTranslateEntities( 414 const nsAString& aStr, nsAString& aOutputStr) { 415 if (mBodyOnly && !mInBody) { 416 return true; 417 } 418 419 if (mDisableEntityEncoding) { 420 return aOutputStr.Append(aStr, mozilla::fallible); 421 } 422 423 if (mFlags & (nsIDocumentEncoder::OutputEncodeBasicEntities)) { 424 // Per the API documentation, encode , &, <, >, and " 425 if (mInAttribute) { 426 return nsXMLContentSerializer::AppendAndTranslateEntities<kValNBSP>( 427 aStr, aOutputStr, kAttrEntities, kEntityStrings); 428 } 429 430 return nsXMLContentSerializer::AppendAndTranslateEntities<kValNBSP>( 431 aStr, aOutputStr, kEntities, kEntityStrings); 432 } 433 434 // We don't want to call into our superclass 2-arg version of 435 // AppendAndTranslateEntities, because it wants to encode more characters 436 // than we do. Use our tables, but avoid encoding by passing in a 437 // smaller max index. This will only encode &, <, >, and ". 438 if (mInAttribute) { 439 return nsXMLContentSerializer::AppendAndTranslateEntities<kGTVal>( 440 aStr, aOutputStr, kAttrEntities, kEntityStrings); 441 } 442 443 return nsXMLContentSerializer::AppendAndTranslateEntities<kGTVal>( 444 aStr, aOutputStr, kEntities, kEntityStrings); 445 }