tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

xmlparser.h (9005B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2004-2005, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  xmlparser.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2004jul21
     16 *   created by: Andy Heninger
     17 *
     18 * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
     19 * Not suitable for production use. Not supported.
     20 * Not conformant. Not efficient.
     21 * But very small.
     22 */
     23 
     24 #ifndef __XMLPARSER_H__
     25 #define __XMLPARSER_H__
     26 
     27 #include "unicode/uobject.h"
     28 #include "unicode/unistr.h"
     29 #include "unicode/regex.h"
     30 #include "uvector.h"
     31 #include "hash.h"
     32 
     33 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
     34 
     35 enum UXMLNodeType {  // Type tag for a child node; reported through the 'type' parameter of UXMLElement::getChild().
     36    /** Node type string (text contents), stored as a UnicodeString. */
     37    UXML_NODE_TYPE_STRING,
     38    /** Node type element, stored as a UXMLElement. */
     39    UXML_NODE_TYPE_ELEMENT,
     40    UXML_NODE_TYPE_COUNT  /**< Last enumerator, so its value equals the number of node types above; presumably a count/sentinel, not a real node type. */
     41 };
     42 
     43 U_NAMESPACE_BEGIN
     44 
     45 class UXMLParser;
     46 
     47 /**
     48 * This class represents an element node in a parsed XML tree.
        * All constructors are private and UXMLParser is a friend, so elements are
        * created by the parser rather than constructed directly by client code.
     49 */
     50 class U_TOOLUTIL_API UXMLElement : public UObject {
     51 public:
     52    /**
     53     * Destructor.
     54     */
     55    virtual ~UXMLElement();
     56 
     57    /**
     58     * Get the tag name of this element.
            * @return A const reference to the tag name string.
     59     */
     60    const UnicodeString &getTagName() const;
     61    /**
     62     * Get the text contents of the element.
     63     * Append the contents of all text child nodes.
     64     * @param recurse If true, also recursively appends the contents of all
     65     *        text child nodes of element children.
     66     * @return The text contents, returned by value.
     67     */
     68    UnicodeString getText(UBool recurse) const;
     69    /**
     70     * Get the number of attributes.
     71     */
     72    int32_t countAttributes() const;
     73    /**
     74     * Get the i-th attribute.
     75     * @param i Index of the attribute.
     76     * @param name Output parameter, receives the attribute name.
     77     * @param value Output parameter, receives the attribute value.
     78     * @return A pointer to the attribute value (may be &value or a pointer to an
     79     *         internal string object), or nullptr if i is out of bounds.
     80     */
     81    const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
     82    /**
     83     * Get the value of the attribute with the given name.
     84     * @param name Attribute name to be looked up.
     85     * @return A pointer to the attribute value, or nullptr if this element
     86     * does not have this attribute.
     87     */
     88    const UnicodeString *getAttribute(const UnicodeString &name) const;
     89    /**
     90     * Get the number of child nodes.
     91     */
     92    int32_t countChildren() const;
     93    /**
     94     * Get the i-th child node.
     95     * @param i Index of the child node.
     96     * @param type Output parameter, receives the child node type. Per UXMLNodeType,
            *        a string node is stored as a UnicodeString and an element node as a UXMLElement.
     97     * @return A pointer to the child node object, or nullptr if i is out of bounds.
     98     */
     99    const UObject *getChild(int32_t i, UXMLNodeType &type) const;
    100    /**
    101     * Get the next child element node, skipping non-element child nodes.
    102     * @param i Enumeration index, passed by non-const reference; initialize to 0 before getting the first child element
            *        and pass the same variable to later calls (presumably each call advances it - confirm).
    103     * @return A pointer to the next child element, or nullptr if there is none.
    104     */
    105    const UXMLElement *nextChildElement(int32_t &i) const;
    106    /**
    107     * Get the immediate child element with the given name.
    108     * If there are multiple child elements with this name, then return
    109     * the first one.
    110     * @param name Element name to be looked up.
    111     * @return A pointer to the element node, or nullptr if this element
    112     * does not have this immediate child element.
    113     */
    114    const UXMLElement *getChildElement(const UnicodeString &name) const;
    115 
    116    /**
    117     * ICU "poor man's RTTI", returns a UClassID for the actual class.
    118     */
    119    virtual UClassID getDynamicClassID() const override;
    120 
    121    /**
    122     * ICU "poor man's RTTI", returns a UClassID for this class.
    123     */
    124    static UClassID U_EXPORT2 getStaticClassID();
    125 
    126 private:
    127    // Declared private to prevent default construction, copying and assignment by client code.
    128    UXMLElement();
    129    UXMLElement(const UXMLElement &other);
    130    UXMLElement &operator=(const UXMLElement &other);
    131 
    132    void appendText(UnicodeString &text, UBool recurse) const;   // Private helper with the same 'recurse' flag as getText(); presumably appends this element's text to 'text'.
    133 
    134    friend class UXMLParser;   // Gives UXMLParser access to the private constructor and members below.
    135 
    136    UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);   // Private: callable only from this class and its friend UXMLParser.
    137 
    138    const UXMLParser *fParser;           // Parser associated with this element (see the constructor); presumably the one that built the tree.
    139    const UnicodeString *fName;          // The tag name of this element (owned by the UXMLParser)
    140    UnicodeString       fContent;        // The text content of this node.  All element content is
    141                                         //   concatenated even when there are intervening nested elements
    142                                         //   (which doesn't happen with most xml files we care about)
    143                                         //   Sections of content containing only white space are dropped,
    144                                         //   which gets rid of the bogus white space content from
    145                                         //   elements which are primarily containers for nested elements.
    146    UVector             fAttNames;       // A vector containing the names of this element's attributes
    147                                         //    The names are UnicodeString objects, owned by the UXMLParser.
    148    UVector             fAttValues;      // A vector containing the attribute values for
    149                                         //    this element's attributes.  The order is the same
    150                                         //    as that of the attribute name vector.
    151 
    152    UVector             fChildren;       // The child nodes of this element (a Vector)
    153 
    154    UXMLElement        *fParent;         // A pointer to the parent element of this element.
    155 };
    156 
    157 /**
    158 * A simple XML parser; it is neither efficient nor conformant and only useful for
    159 * restricted types of XML documents.
    160 *
    161 * The parse methods parse whole documents and return the parse trees via their
    162 * root elements.
        *
        * The constructors are private; instances are obtained through createParser().
    163 */
    164 class U_TOOLUTIL_API UXMLParser : public UObject {
    165 public:
    166    /**
    167     * Create an XML parser.
            * @param errorCode ICU error code, passed by non-const reference.
            * @return A pointer to the new parser. NOTE(review): ownership is not stated
            *         here; presumably the caller deletes it (the destructor is public) - confirm.
    168     */
    169    static UXMLParser *createParser(UErrorCode &errorCode);
    170    /**
    171     * Destructor.
    172     */
    173    virtual ~UXMLParser();
    174 
    175    /**
    176     * Parse an XML document, create the entire document tree, and
    177     * return a pointer to the root element of the parsed tree.
    178     * The caller must delete the element.
            * @param src The XML document to parse.
            * @param errorCode ICU error code, passed by non-const reference.
            * @return A pointer to the root element of the parsed tree.
    179     */
    180    UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
    181    /**
    182     * Parse an XML file, create the entire document tree, and
    183     * return a pointer to the root element of the parsed tree.
    184     * The caller must delete the element.
            * @param filename Name of the XML file to parse.
            * @param errorCode ICU error code, passed by non-const reference.
            * @return A pointer to the root element of the parsed tree.
    185     */
    186    UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
    187 
    188    /**
    189     * ICU "poor man's RTTI", returns a UClassID for the actual class.
    190     */
    191    virtual UClassID getDynamicClassID() const override;
    192 
    193    /**
    194     * ICU "poor man's RTTI", returns a UClassID for this class.
    195     */
    196    static UClassID U_EXPORT2 getStaticClassID();
    197 
    198 private:
    199    // Declared private to prevent default construction, copying and assignment by client code.
    200    UXMLParser();
    201    UXMLParser(const UXMLParser &other);
    202    UXMLParser &operator=(const UXMLParser &other);
    203 
    204    // Private constructor taking an error code; public creation goes through createParser().
    205    UXMLParser(UErrorCode &status);
    206    // Internal parsing helpers; only declared in this header, their behavior is defined in the implementation.
    207    void           parseMisc(UErrorCode &status);
    208    UXMLElement   *createElement(RegexMatcher &mEl, UErrorCode &status);
    209    void           error(const char *message, UErrorCode &status);
    210    UnicodeString  scanContent(UErrorCode &status);
    211    void           replaceCharRefs(UnicodeString &s, UErrorCode &status);
    212    // Name handling (see fNames below); presumably intern() adds a name and findName() only looks one up - confirm.
    213    const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
    214 public:
    215    // public for UXMLElement only
    216    const UnicodeString *findName(const UnicodeString &s) const;
    217 private:
    218 
    219    // There is one ICU regex matcher for each of the major XML syntax items
    220    //  that are recognized.
    221    RegexMatcher mXMLDecl;
    222    RegexMatcher mXMLComment;
    223    RegexMatcher mXMLSP;
    224    RegexMatcher mXMLDoctype;
    225    RegexMatcher mXMLPI;
    226    RegexMatcher mXMLElemStart;
    227    RegexMatcher mXMLElemEnd;
    228    RegexMatcher mXMLElemEmpty;
    229    RegexMatcher mXMLCharData;
    230    RegexMatcher mAttrValue;
    231    RegexMatcher mAttrNormalizer;
    232    RegexMatcher mNewLineNormalizer;
    233    RegexMatcher mAmps;
    234 
    235    Hashtable             fNames;           // interned element/attribute name strings
    236    UStack                fElementStack;    // Stack holds the parent elements when nested
    237                                            //    elements are being parsed.  All items on this
    238                                            //    stack are of type UXMLElement.
    239    int32_t               fPos;             // String index of the current scan position in
    240                                            //    xml source (in fSrc).  NOTE(review): no fSrc member is declared in this class - stale reference?
    241    UnicodeString         fOneLF;           // Presumably a string holding a single line feed (see mNewLineNormalizer) - confirm.
    242 };
    243 
    244 U_NAMESPACE_END
    245 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
    246 
    247 #endif /* __XMLPARSER_H__ */