tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

nsHTMLContentSerializer.cpp (15334B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 /*
      8 * nsIContentSerializer implementation that can be used with an
      9 * nsIDocumentEncoder to convert an HTML (not XHTML!) DOM to an HTML
     10 * string that could be parsed into more or less the original DOM.
     11 */
     12 
     13 #include "nsHTMLContentSerializer.h"
     14 
     15 #include "mozilla/dom/Document.h"
     16 #include "mozilla/dom/Element.h"
     17 #include "nsAttrName.h"
     18 #include "nsCRT.h"
     19 #include "nsContentUtils.h"
     20 #include "nsElementTable.h"
     21 #include "nsEscape.h"
     22 #include "nsGkAtoms.h"
     23 #include "nsIContent.h"
     24 #include "nsIDocumentEncoder.h"
     25 #include "nsIScriptElement.h"
     26 #include "nsIURI.h"
     27 #include "nsNameSpaceManager.h"
     28 #include "nsNetUtil.h"
     29 #include "nsParserConstants.h"
     30 #include "nsString.h"
     31 #include "nsUnicharUtils.h"
     32 
     33 using namespace mozilla::dom;
     34 
     35 nsresult NS_NewHTMLContentSerializer(nsIContentSerializer** aSerializer) {
     36  RefPtr<nsHTMLContentSerializer> it = new nsHTMLContentSerializer();
     37  it.forget(aSerializer);
     38  return NS_OK;
     39 }
     40 
     41 nsHTMLContentSerializer::nsHTMLContentSerializer() { mIsHTMLSerializer = true; }
     42 
     43 nsHTMLContentSerializer::~nsHTMLContentSerializer() = default;
     44 
     45 NS_IMETHODIMP
     46 nsHTMLContentSerializer::AppendDocumentStart(Document* aDocument) {
     47  return NS_OK;
     48 }
     49 
     50 bool nsHTMLContentSerializer::SerializeHTMLAttributes(
     51    Element* aElement, Element* aOriginalElement, nsAString& aTagPrefix,
     52    const nsAString& aTagNamespaceURI, nsAtom* aTagName, int32_t aNamespace,
     53    nsAString& aStr) {
     54  MaybeSerializeIsValue(aElement, aStr);
     55 
     56  int32_t count = aElement->GetAttrCount();
     57  if (!count) return true;
     58 
     59  nsresult rv;
     60  nsAutoString valueStr;
     61 
     62  for (int32_t index = 0; index < count; index++) {
     63    const nsAttrName* name = aElement->GetAttrNameAt(index);
     64    int32_t namespaceID = name->NamespaceID();
     65    nsAtom* attrName = name->LocalName();
     66 
     67    // Filter out any attribute starting with [-|_]moz
     68    nsDependentAtomString attrNameStr(attrName);
     69    if (StringBeginsWith(attrNameStr, u"_moz"_ns) ||
     70        StringBeginsWith(attrNameStr, u"-moz"_ns)) {
     71      continue;
     72    }
     73    aElement->GetAttr(namespaceID, attrName, valueStr);
     74 
     75    if (mIsCopying && mIsFirstChildOfOL && aTagName == nsGkAtoms::li &&
     76        aNamespace == kNameSpaceID_XHTML && attrName == nsGkAtoms::value &&
     77        namespaceID == kNameSpaceID_None) {
     78      // This is handled separately in SerializeLIValueAttribute()
     79      continue;
     80    }
     81    bool isJS = IsJavaScript(aElement, attrName, namespaceID, valueStr);
     82 
     83    if (((attrName == nsGkAtoms::href && (namespaceID == kNameSpaceID_None ||
     84                                          namespaceID == kNameSpaceID_XLink)) ||
     85         (attrName == nsGkAtoms::src && namespaceID == kNameSpaceID_None))) {
     86      // Make all links absolute when converting only the selection:
     87      if (mFlags & nsIDocumentEncoder::OutputAbsoluteLinks) {
     88        // Would be nice to handle OBJECT tags, but that gets more complicated
     89        // since we have to search the tag list for CODEBASE as well. For now,
     90        // just leave them relative.
     91        nsIURI* uri = aElement->GetBaseURI();
     92        if (uri) {
     93          nsAutoString absURI;
     94          rv = NS_MakeAbsoluteURI(absURI, valueStr, uri);
     95          if (NS_SUCCEEDED(rv)) {
     96            valueStr = absURI;
     97          }
     98        }
     99      }
    100    }
    101 
    102    if (mRewriteEncodingDeclaration && aTagName == nsGkAtoms::meta &&
    103        aNamespace == kNameSpaceID_XHTML && attrName == nsGkAtoms::content &&
    104        namespaceID == kNameSpaceID_None) {
    105      // If we're serializing a <meta http-equiv="content-type">,
    106      // use the proper value, rather than what's in the document.
    107      nsAutoString header;
    108      aElement->GetAttr(nsGkAtoms::httpEquiv, header);
    109      if (header.LowerCaseEqualsLiteral("content-type")) {
    110        valueStr = u"text/html; charset="_ns + NS_ConvertASCIItoUTF16(mCharset);
    111      }
    112    }
    113 
    114    nsDependentAtomString nameStr(attrName);
    115    nsAutoString prefix;
    116    if (namespaceID == kNameSpaceID_XML) {
    117      prefix.AssignLiteral(u"xml");
    118    } else if (namespaceID == kNameSpaceID_XLink) {
    119      prefix.AssignLiteral(u"xlink");
    120    }
    121 
    122    // Expand shorthand attribute.
    123    if (aNamespace == kNameSpaceID_XHTML && namespaceID == kNameSpaceID_None &&
    124        IsShorthandAttr(attrName, aTagName) && valueStr.IsEmpty()) {
    125      valueStr = nameStr;
    126    }
    127    NS_ENSURE_TRUE(SerializeAttr(prefix, nameStr, valueStr, aStr, !isJS),
    128                   false);
    129  }
    130 
    131  return true;
    132 }
    133 
    134 NS_IMETHODIMP
    135 nsHTMLContentSerializer::AppendElementStart(Element* aElement,
    136                                            Element* aOriginalElement) {
    137  NS_ENSURE_ARG(aElement);
    138  NS_ENSURE_STATE(mOutput);
    139 
    140  bool forceFormat = false;
    141  nsresult rv = NS_OK;
    142  if (!CheckElementStart(aElement, forceFormat, *mOutput, rv)) {
    143    // When we go to AppendElementEnd for this element, we're going to
    144    // MaybeLeaveFromPreContent().  So make sure to MaybeEnterInPreContent()
    145    // now, so our PreLevel() doesn't get confused.
    146    MaybeEnterInPreContent(aElement);
    147    return rv;
    148  }
    149 
    150  NS_ENSURE_SUCCESS(rv, rv);
    151 
    152  nsAtom* name = aElement->NodeInfo()->NameAtom();
    153  int32_t ns = aElement->GetNameSpaceID();
    154 
    155  bool lineBreakBeforeOpen = LineBreakBeforeOpen(ns, name);
    156 
    157  if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) {
    158    if (mColPos && lineBreakBeforeOpen) {
    159      NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY);
    160    } else {
    161      NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput),
    162                     NS_ERROR_OUT_OF_MEMORY);
    163    }
    164    if (!mColPos) {
    165      NS_ENSURE_TRUE(AppendIndentation(*mOutput), NS_ERROR_OUT_OF_MEMORY);
    166    } else if (mAddSpace) {
    167      bool result = AppendToString(char16_t(' '), *mOutput);
    168      mAddSpace = false;
    169      NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY);
    170    }
    171  } else if (mAddSpace) {
    172    bool result = AppendToString(char16_t(' '), *mOutput);
    173    mAddSpace = false;
    174    NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY);
    175  } else {
    176    NS_ENSURE_TRUE(MaybeAddNewlineForRootNode(*mOutput),
    177                   NS_ERROR_OUT_OF_MEMORY);
    178  }
    179  // Always reset to avoid false newlines in case MaybeAddNewlineForRootNode
    180  // wasn't called
    181  mAddNewlineForRootNode = false;
    182 
    183  NS_ENSURE_TRUE(AppendToString(kLessThan, *mOutput), NS_ERROR_OUT_OF_MEMORY);
    184 
    185  NS_ENSURE_TRUE(AppendToString(nsDependentAtomString(name), *mOutput),
    186                 NS_ERROR_OUT_OF_MEMORY);
    187 
    188  MaybeEnterInPreContent(aElement);
    189 
    190  // for block elements, we increase the indentation
    191  if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel())
    192    NS_ENSURE_TRUE(IncrIndentation(name), NS_ERROR_OUT_OF_MEMORY);
    193 
    194  // Need to keep track of OL and LI elements in order to get ordinal number
    195  // for the LI.
    196  if (mIsCopying && name == nsGkAtoms::ol && ns == kNameSpaceID_XHTML) {
    197    // We are copying and current node is an OL;
    198    // Store its start attribute value in olState->startVal.
    199    nsAutoString start;
    200    int32_t startAttrVal = 0;
    201 
    202    aElement->GetAttr(nsGkAtoms::start, start);
    203    if (!start.IsEmpty()) {
    204      nsresult rv = NS_OK;
    205      startAttrVal = start.ToInteger(&rv);
    206      // If OL has "start" attribute, first LI element has to start with that
    207      // value Therefore subtracting 1 as all the LI elements are incrementing
    208      // it before using it; In failure of ToInteger(), default StartAttrValue
    209      // to 0.
    210      if (NS_SUCCEEDED(rv))
    211        startAttrVal--;
    212      else
    213        startAttrVal = 0;
    214    }
    215    mOLStateStack.AppendElement(olState(startAttrVal, true));
    216  }
    217 
    218  if (mIsCopying && name == nsGkAtoms::li && ns == kNameSpaceID_XHTML) {
    219    mIsFirstChildOfOL = IsFirstChildOfOL(aOriginalElement);
    220    if (mIsFirstChildOfOL) {
    221      // If OL is parent of this LI, serialize attributes in different manner.
    222      NS_ENSURE_TRUE(SerializeLIValueAttribute(aElement, *mOutput),
    223                     NS_ERROR_OUT_OF_MEMORY);
    224    }
    225  }
    226 
    227  // Even LI passed above have to go through this
    228  // for serializing attributes other than "value".
    229  nsAutoString dummyPrefix;
    230  NS_ENSURE_TRUE(
    231      SerializeHTMLAttributes(aElement, aOriginalElement, dummyPrefix, u""_ns,
    232                              name, ns, *mOutput),
    233      NS_ERROR_OUT_OF_MEMORY);
    234 
    235  NS_ENSURE_TRUE(AppendToString(kGreaterThan, *mOutput),
    236                 NS_ERROR_OUT_OF_MEMORY);
    237 
    238  if (ns == kNameSpaceID_XHTML &&
    239      (name == nsGkAtoms::script || name == nsGkAtoms::style ||
    240       (name == nsGkAtoms::noscript &&
    241        aElement->OwnerDoc()->IsScriptEnabled()) ||
    242       name == nsGkAtoms::noframes)) {
    243    ++mDisableEntityEncoding;
    244  }
    245 
    246  if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel() &&
    247      LineBreakAfterOpen(ns, name)) {
    248    NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY);
    249  }
    250 
    251  NS_ENSURE_TRUE(AfterElementStart(aElement, aOriginalElement, *mOutput),
    252                 NS_ERROR_OUT_OF_MEMORY);
    253 
    254  return NS_OK;
    255 }
    256 
    257 NS_IMETHODIMP
    258 nsHTMLContentSerializer::AppendElementEnd(Element* aElement,
    259                                          Element* aOriginalElement) {
    260  NS_ENSURE_ARG(aElement);
    261  NS_ENSURE_STATE(mOutput);
    262 
    263  nsAtom* name = aElement->NodeInfo()->NameAtom();
    264  int32_t ns = aElement->GetNameSpaceID();
    265 
    266  if (ns == kNameSpaceID_XHTML &&
    267      (name == nsGkAtoms::script || name == nsGkAtoms::style ||
    268       (name == nsGkAtoms::noscript &&
    269        aElement->OwnerDoc()->IsScriptEnabled()) ||
    270       name == nsGkAtoms::noframes)) {
    271    --mDisableEntityEncoding;
    272  }
    273 
    274  bool forceFormat = !(mFlags & nsIDocumentEncoder::OutputIgnoreMozDirty) &&
    275                     aElement->HasAttr(nsGkAtoms::mozdirty);
    276 
    277  if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) {
    278    DecrIndentation(name);
    279  }
    280 
    281  if (name == nsGkAtoms::script) {
    282    nsCOMPtr<nsIScriptElement> script = do_QueryInterface(aElement);
    283 
    284    if (ShouldMaintainPreLevel() && script && script->IsMalformed()) {
    285      // We're looking at a malformed script tag. This means that the end tag
    286      // was missing in the source. Imitate that here by not serializing the end
    287      // tag.
    288      --PreLevel();
    289      return NS_OK;
    290    }
    291  } else if (mIsCopying && name == nsGkAtoms::ol && ns == kNameSpaceID_XHTML) {
    292    NS_ASSERTION((!mOLStateStack.IsEmpty()), "Cannot have an empty OL Stack");
    293    /* Though at this point we must always have an state to be deleted as all
    294    the OL opening tags are supposed to push an olState object to the stack*/
    295    if (!mOLStateStack.IsEmpty()) {
    296      mOLStateStack.RemoveLastElement();
    297    }
    298  }
    299 
    300  if (ns == kNameSpaceID_XHTML) {
    301    bool isContainer =
    302        nsHTMLElement::IsContainer(nsHTMLTags::CaseSensitiveAtomTagToId(name));
    303    if (!isContainer) {
    304      // Keep this in sync with the cleanup at the end of this method.
    305      MOZ_ASSERT(name != nsGkAtoms::body);
    306      MaybeLeaveFromPreContent(aElement);
    307      return NS_OK;
    308    }
    309  }
    310 
    311  if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel()) {
    312    bool lineBreakBeforeClose = LineBreakBeforeClose(ns, name);
    313 
    314    if (mColPos && lineBreakBeforeClose) {
    315      NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY);
    316    }
    317    if (!mColPos) {
    318      NS_ENSURE_TRUE(AppendIndentation(*mOutput), NS_ERROR_OUT_OF_MEMORY);
    319    } else if (mAddSpace) {
    320      bool result = AppendToString(char16_t(' '), *mOutput);
    321      mAddSpace = false;
    322      NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY);
    323    }
    324  } else if (mAddSpace) {
    325    bool result = AppendToString(char16_t(' '), *mOutput);
    326    mAddSpace = false;
    327    NS_ENSURE_TRUE(result, NS_ERROR_OUT_OF_MEMORY);
    328  }
    329 
    330  NS_ENSURE_TRUE(AppendToString(kEndTag, *mOutput), NS_ERROR_OUT_OF_MEMORY);
    331  NS_ENSURE_TRUE(AppendToString(nsDependentAtomString(name), *mOutput),
    332                 NS_ERROR_OUT_OF_MEMORY);
    333  NS_ENSURE_TRUE(AppendToString(kGreaterThan, *mOutput),
    334                 NS_ERROR_OUT_OF_MEMORY);
    335 
    336  // Keep this cleanup in sync with the IsContainer() early return above.
    337  MaybeLeaveFromPreContent(aElement);
    338 
    339  if ((mDoFormat || forceFormat) && !mDoRaw && !PreLevel() &&
    340      LineBreakAfterClose(ns, name)) {
    341    NS_ENSURE_TRUE(AppendNewLineToString(*mOutput), NS_ERROR_OUT_OF_MEMORY);
    342  } else {
    343    MaybeFlagNewlineForRootNode(aElement);
    344  }
    345 
    346  if (name == nsGkAtoms::body && ns == kNameSpaceID_XHTML) {
    347    --mInBody;
    348  }
    349 
    350  return NS_OK;
    351 }
    352 
    353 static const uint16_t kValNBSP = 160;
    354 
    355 #define _ 0
    356 
    357 // This table indexes into kEntityStrings[].
    358 const uint8_t nsHTMLContentSerializer::kEntities[] = {
    359    // clang-format off
    360  _, _, _, _, _, _, _, _, _, _,
    361  _, _, _, _, _, _, _, _, _, _,
    362  _, _, _, _, _, _, _, _, _, _,
    363  _, _, _, _, _, _, _, _, 2, _,
    364  _, _, _, _, _, _, _, _, _, _,
    365  _, _, _, _, _, _, _, _, _, _,
    366  3, _, 4, _, _, _, _, _, _, _,
    367  _, _, _, _, _, _, _, _, _, _,
    368  _, _, _, _, _, _, _, _, _, _,
    369  _, _, _, _, _, _, _, _, _, _,
    370  _, _, _, _, _, _, _, _, _, _,
    371  _, _, _, _, _, _, _, _, _, _,
    372  _, _, _, _, _, _, _, _, _, _,
    373  _, _, _, _, _, _, _, _, _, _,
    374  _, _, _, _, _, _, _, _, _, _,
    375  _, _, _, _, _, _, _, _, _, _,
    376  5
    377    // clang-format on
    378 };
    379 
    380 // This table indexes into kEntityStrings[].
    381 const uint8_t nsHTMLContentSerializer::kAttrEntities[] = {
    382    // clang-format off
    383  _, _, _, _, _, _, _, _, _, _,
    384  _, _, _, _, _, _, _, _, _, _,
    385  _, _, _, _, _, _, _, _, _, _,
    386  _, _, _, _, 1, _, _, _, 2, _,
    387  _, _, _, _, _, _, _, _, _, _,
    388  _, _, _, _, _, _, _, _, _, _,
    389  3, _, 4, _, _, _, _, _, _, _,
    390  _, _, _, _, _, _, _, _, _, _,
    391  _, _, _, _, _, _, _, _, _, _,
    392  _, _, _, _, _, _, _, _, _, _,
    393  _, _, _, _, _, _, _, _, _, _,
    394  _, _, _, _, _, _, _, _, _, _,
    395  _, _, _, _, _, _, _, _, _, _,
    396  _, _, _, _, _, _, _, _, _, _,
    397  _, _, _, _, _, _, _, _, _, _,
    398  _, _, _, _, _, _, _, _, _, _,
    399  5
    400    // clang-format on
    401 };
    402 
    403 #undef _
    404 
    405 const char* const nsHTMLContentSerializer::kEntityStrings[] = {
    406    /* 0 */ nullptr,
    407    /* 1 */ "&quot;",
    408    /* 2 */ "&amp;",
    409    /* 3 */ "&lt;",
    410    /* 4 */ "&gt;",
    411    /* 5 */ "&nbsp;"};
    412 
    413 bool nsHTMLContentSerializer::AppendAndTranslateEntities(
    414    const nsAString& aStr, nsAString& aOutputStr) {
    415  if (mBodyOnly && !mInBody) {
    416    return true;
    417  }
    418 
    419  if (mDisableEntityEncoding) {
    420    return aOutputStr.Append(aStr, mozilla::fallible);
    421  }
    422 
    423  if (mFlags & (nsIDocumentEncoder::OutputEncodeBasicEntities)) {
    424    // Per the API documentation, encode &nbsp;, &amp;, &lt;, &gt;, and &quot;
    425    if (mInAttribute) {
    426      return nsXMLContentSerializer::AppendAndTranslateEntities<kValNBSP>(
    427          aStr, aOutputStr, kAttrEntities, kEntityStrings);
    428    }
    429 
    430    return nsXMLContentSerializer::AppendAndTranslateEntities<kValNBSP>(
    431        aStr, aOutputStr, kEntities, kEntityStrings);
    432  }
    433 
    434  // We don't want to call into our superclass 2-arg version of
    435  // AppendAndTranslateEntities, because it wants to encode more characters
    436  // than we do.  Use our tables, but avoid encoding &nbsp; by passing in a
    437  // smaller max index.  This will only encode &amp;, &lt;, &gt;, and &quot;.
    438  if (mInAttribute) {
    439    return nsXMLContentSerializer::AppendAndTranslateEntities<kGTVal>(
    440        aStr, aOutputStr, kAttrEntities, kEntityStrings);
    441  }
    442 
    443  return nsXMLContentSerializer::AppendAndTranslateEntities<kGTVal>(
    444      aStr, aOutputStr, kEntities, kEntityStrings);
    445 }