tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

xmlparser.cpp (28930B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2004-2010, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  xmlparser.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2004jul21
     16 *   created by: Andy Heninger
     17 */
     18 
     19 #include <stdio.h>
     20 #include "unicode/uchar.h"
     21 #include "unicode/ucnv.h"
     22 #include "unicode/regex.h"
     23 #include "filestrm.h"
     24 #include "xmlparser.h"
     25 
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
     27 
     28 // character constants
     29 enum {
     30    x_QUOT=0x22,
     31    x_AMP=0x26,
     32    x_APOS=0x27,
     33    x_LT=0x3c,
     34    x_GT=0x3e,
     35    x_l=0x6c
     36 };
     37 
     38 #define  XML_SPACES "[ \\u0009\\u000d\\u000a]"
     39 
     40 // XML #4
     41 #define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
     42                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
     43                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
     44                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"
     45 
     46 //  XML #5
     47 #define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
     48 
     49 //  XML #6
     50 #define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
     51 
     52 U_NAMESPACE_BEGIN
     53 
     54 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
     55 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
     56 
     57 //
     58 //   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
     59 //                             used for parsing.
     60 //
     61 UXMLParser::UXMLParser(UErrorCode &status) :
     62      //  XML Declaration.  XML Production #23.
     63      //      example:  "<?xml version=1.0 encoding="utf-16" ?>
     64      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
     65      //            allow for a possible leading BOM.
     66      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
     67      
     68      //  XML Comment   production #15
     69      //     example:  "<!-- whatever -->
     70      //       note, does not detect an illegal "--" within comments
     71      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
     72      
     73      //  XML Spaces
     74      //      production [3]
     75      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
     76      
     77      //  XML Doctype decl  production #28
     78      //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
     79      //       or      "<!DOCTYPE foo [internal dtd]>
     80      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
     81      //           Some internal dtd subsets could confuse this simple-minded
     82      //           attempt at skipping over them, specifically, occurrences
     83      //           of closing square brackets.  These could appear in comments, 
     84      //           or in parameter entity declarations, for example.
     85      mXMLDoctype(UnicodeString(
     86           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
     87           ), 0, status),
     88      
     89      //  XML PI     production #16
     90      //     example   "<?target stuff?>
     91      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
     92      
     93      //  XML Element Start   Productions #40, #41
     94      //          example   <foo att1='abc'  att2="d e f" >
     95      //      capture #1:  the tag name
     96      //
     97      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
     98          "(?:" 
     99                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
    100                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
    101          ")*"                                                             //   * for zero or more attributes.
    102          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
    103      
    104      //  XML Element End     production #42
    105      //     example   </foo>
    106      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
    107      
    108      // XML Element Empty    production #44
    109      //     example   <foo att1="abc"   att2="d e f" />
    110      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
    111          "(?:" 
    112                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
    113                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
    114          ")*"                                                             //   * for zero or more attributes.
    115          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
    116      
    117 
    118      // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
    119      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
    120 
    121      // Attribute name = "value".  XML Productions 10, 40/41
    122      //  Capture group 1 is name, 
    123      //                2 is the attribute value, including the quotes.
    124      //
    125      //   Note that attributes are scanned twice.  The first time is with
    126      //        the regex for an entire element start.  There, the attributes
    127      //        are checked syntactically, but not separated out one by one.
    128      //        Here, we match a single attribute, and make its name and
    129      //        attribute value available to the parser code.
    130      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
    131         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
    132 
    133 
    134      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
    135 
    136      // Match any of the new-line sequences in content.
    137      //   All are changed to \u000a.
    138      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
    139 
    140      // & char references
    141      //   We will figure out what we've got based on which capture group has content.
    142      //   The last one is a catchall for unrecognized entity references..
    143      //             1     2     3      4      5           6                    7          8
    144      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
    145                0, status),
    146 
    147      fNames(status),
    148      fElementStack(status),
    149      fOneLF(static_cast<char16_t>(0x0a)) // Plain new-line string, used in new line normalization.
    150      {
    151      }
    152 
    153 UXMLParser *
    154 UXMLParser::createParser(UErrorCode &errorCode) {
    155    if (U_FAILURE(errorCode)) {
    156        return nullptr;
    157    } else {
    158        return new UXMLParser(errorCode);
    159    }
    160 }
    161 
    162 UXMLParser::~UXMLParser() {}
    163 
    164 UXMLElement *
    165 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
    166    char bytes[4096], charsetBuffer[100];
    167    FileStream *f;
    168    const char *charset, *pb;
    169    UnicodeString src;
    170    UConverter *cnv;
    171    char16_t *buffer, *pu;
    172    int32_t fileLength, bytesLength, length, capacity;
    173    UBool flush;
    174 
    175    if(U_FAILURE(errorCode)) {
    176        return nullptr;
    177    }
    178 
    179    f=T_FileStream_open(filename, "rb");
    180    if(f==nullptr) {
    181        errorCode=U_FILE_ACCESS_ERROR;
    182        return nullptr;
    183    }
    184 
    185    bytesLength = T_FileStream_read(f, bytes, static_cast<int32_t>(sizeof(bytes)));
    186    if (bytesLength < static_cast<int32_t>(sizeof(bytes))) {
    187        // we have already read the entire file
    188        fileLength=bytesLength;
    189    } else {
    190        // get the file length
    191        fileLength=T_FileStream_size(f);
    192    }
    193 
    194    /*
    195     * get the charset:
    196     * 1. Unicode signature
    197     * 2. treat as ISO-8859-1 and read XML encoding="charser"
    198     * 3. default to UTF-8
    199     */
    200    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, nullptr, &errorCode);
    201    if(U_SUCCESS(errorCode) && charset!=nullptr) {
    202        // open converter according to Unicode signature
    203        cnv=ucnv_open(charset, &errorCode);
    204    } else {
    205        // read as Latin-1 and parse the XML declaration and encoding
    206        cnv=ucnv_open("ISO-8859-1", &errorCode);
    207        if(U_FAILURE(errorCode)) {
    208            // unexpected error opening Latin-1 converter
    209            goto exit;
    210        }
    211 
    212        buffer=toUCharPtr(src.getBuffer(bytesLength));
    213        if(buffer==nullptr) {
    214            // unexpected failure to reserve some string capacity
    215            errorCode=U_MEMORY_ALLOCATION_ERROR;
    216            goto exit;
    217        }
    218        pb=bytes;
    219        pu=buffer;
    220        ucnv_toUnicode(
    221            cnv,
    222            &pu, buffer+src.getCapacity(),
    223            &pb, bytes+bytesLength,
    224            nullptr, true, &errorCode);
    225        src.releaseBuffer(U_SUCCESS(errorCode) ? static_cast<int32_t>(pu - buffer) : 0);
    226        ucnv_close(cnv);
    227        cnv=nullptr;
    228        if(U_FAILURE(errorCode)) {
    229            // unexpected error in conversion from Latin-1
    230            src.remove();
    231            goto exit;
    232        }
    233 
    234        // parse XML declaration
    235        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
    236            int32_t declEnd=mXMLDecl.end(errorCode);
    237            // go beyond <?xml
    238            int32_t pos = src.indexOf(static_cast<char16_t>(x_l)) + 1;
    239 
    240            mAttrValue.reset(src);
    241            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
    242                UnicodeString attName  = mAttrValue.group(1, errorCode);
    243                UnicodeString attValue = mAttrValue.group(2, errorCode);
    244 
    245                // Trim the quotes from the att value.  These are left over from the original regex
    246                //   that parsed the attribute, which couldn't conveniently strip them.
    247                attValue.remove(0,1);                    // one char from the beginning
    248                attValue.truncate(attValue.length()-1);  // and one from the end.
    249 
    250                if(attName==UNICODE_STRING("encoding", 8)) {
    251                    length = attValue.extract(0, 0x7fffffff, charsetBuffer, static_cast<int32_t>(sizeof(charsetBuffer)));
    252                    charset=charsetBuffer;
    253                    break;
    254                }
    255                pos = mAttrValue.end(2, errorCode);
    256            }
    257 
    258            if(charset==nullptr) {
    259                // default to UTF-8
    260                charset="UTF-8";
    261            }
    262            cnv=ucnv_open(charset, &errorCode);
    263        }
    264    }
    265 
    266    if(U_FAILURE(errorCode)) {
    267        // unable to open the converter
    268        goto exit;
    269    }
    270 
    271    // convert the file contents
    272    capacity=fileLength;        // estimated capacity
    273    src.getBuffer(capacity);
    274    src.releaseBuffer(0);       // zero length
    275    flush=false;
    276    for(;;) {
    277        // convert contents of bytes[bytesLength]
    278        pb=bytes;
    279        for(;;) {
    280            length=src.length();
    281            buffer=toUCharPtr(src.getBuffer(capacity));
    282            if(buffer==nullptr) {
    283                // unexpected failure to reserve some string capacity
    284                errorCode=U_MEMORY_ALLOCATION_ERROR;
    285                goto exit;
    286            }
    287 
    288            pu=buffer+length;
    289            ucnv_toUnicode(
    290                cnv, &pu, buffer+src.getCapacity(),
    291                &pb, bytes+bytesLength,
    292                nullptr, false, &errorCode);
    293            src.releaseBuffer(U_SUCCESS(errorCode) ? static_cast<int32_t>(pu - buffer) : 0);
    294            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    295                errorCode=U_ZERO_ERROR;
    296                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
    297            } else {
    298                break;
    299            }
    300        }
    301 
    302        if(U_FAILURE(errorCode)) {
    303            break; // conversion error
    304        }
    305 
    306        if(flush) {
    307            break; // completely converted the file
    308        }
    309 
    310        // read next block
    311        bytesLength = T_FileStream_read(f, bytes, static_cast<int32_t>(sizeof(bytes)));
    312        if(bytesLength==0) {
    313            // reached end of file, convert once more to flush the converter
    314            flush=true;
    315        }
    316    }
    317 
    318 exit:
    319    ucnv_close(cnv);
    320    T_FileStream_close(f);
    321 
    322    if(U_SUCCESS(errorCode)) {
    323        return parse(src, errorCode);
    324    } else {
    325        return nullptr;
    326    }
    327 }
    328 
    329 UXMLElement *
    330 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
    331    if(U_FAILURE(status)) {
    332        return nullptr;
    333    }
    334 
    335    UXMLElement   *root = nullptr;
    336    fPos = 0; // TODO use just a local pos variable and pass it into functions
    337              // where necessary?
    338 
    339    // set all matchers to work on the input string
    340    mXMLDecl.reset(src);
    341    mXMLComment.reset(src);
    342    mXMLSP.reset(src);
    343    mXMLDoctype.reset(src);
    344    mXMLPI.reset(src);
    345    mXMLElemStart.reset(src);
    346    mXMLElemEnd.reset(src);
    347    mXMLElemEmpty.reset(src);
    348    mXMLCharData.reset(src);
    349    mAttrValue.reset(src);
    350    mAttrNormalizer.reset(src);
    351    mNewLineNormalizer.reset(src);
    352    mAmps.reset(src);
    353 
    354    // Consume the XML Declaration, if present.
    355    if (mXMLDecl.lookingAt(fPos, status)) {
    356        fPos = mXMLDecl.end(status);
    357    }
    358 
    359    // Consume "misc" [XML production 27] appearing before DocType
    360    parseMisc(status);
    361 
    362    // Consume a DocType declaration, if present.
    363    if (mXMLDoctype.lookingAt(fPos, status)) {
    364        fPos = mXMLDoctype.end(status);
    365    }
    366 
    367    // Consume additional "misc" [XML production 27] appearing after the DocType
    368    parseMisc(status);
    369 
    370    // Get the root element
    371    if (mXMLElemEmpty.lookingAt(fPos, status)) {
    372        // Root is an empty element (no nested elements or content)
    373        root = createElement(mXMLElemEmpty, status);
    374        fPos = mXMLElemEmpty.end(status);
    375    } else {
    376        if (mXMLElemStart.lookingAt(fPos, status) == false) {
    377            error("Root Element expected", status);
    378            goto errorExit;
    379        }
    380        root = createElement(mXMLElemStart, status);
    381        UXMLElement  *el = root;
    382 
    383        //
    384        // This is the loop that consumes the root element of the document,
    385        //      including all nested content.   Nested elements are handled by
    386        //      explicit pushes/pops of the element stack; there is no recursion
    387        //      in the control flow of this code.
    388        //      "el" always refers to the current element, the one to which content
    389        //      is being added.  It is above the top of the element stack.
    390        for (;;) {
    391            // Nested Element Start
    392            if (mXMLElemStart.lookingAt(fPos, status)) {
    393                UXMLElement *t = createElement(mXMLElemStart, status);
    394                el->fChildren.addElement(t, status);
    395                t->fParent = el;
    396                fElementStack.push(el, status);
    397                el = t;
    398                continue;
    399            }
    400 
    401            // Text Content.  String is concatenated onto the current node's content,
    402            //                but only if it contains something other than spaces.
    403            UnicodeString s = scanContent(status);
    404            if (s.length() > 0) {
    405                mXMLSP.reset(s);
    406                if (mXMLSP.matches(status) == false) {
    407                    // This chunk of text contains something other than just
    408                    //  white space. Make a child node for it.
    409                    replaceCharRefs(s, status);
    410                    el->fChildren.addElement(s.clone(), status);
    411                }
    412                mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
    413                continue;
    414            }
    415 
    416            // Comments.  Discard.
    417            if (mXMLComment.lookingAt(fPos, status)) {
    418                fPos = mXMLComment.end(status);
    419                continue;
    420            }
    421 
    422            // PIs.  Discard.
    423            if (mXMLPI.lookingAt(fPos, status)) {
    424                fPos = mXMLPI.end(status);
    425                continue;
    426            }
    427 
    428            // Element End
    429            if (mXMLElemEnd.lookingAt(fPos, status)) {
    430                fPos = mXMLElemEnd.end(0, status);
    431                const UnicodeString name = mXMLElemEnd.group(1, status);
    432                if (name != *el->fName) {
    433                    error("Element start / end tag mismatch", status);
    434                    goto errorExit;
    435                }
    436                if (fElementStack.empty()) {
    437                    // Close of the root element.  We're done with the doc.
    438                    el = nullptr;
    439                    break;
    440                }
    441                el = static_cast<UXMLElement*>(fElementStack.pop());
    442                continue;
    443            }
    444 
    445            // Empty Element.  Stored as a child of the current element, but not stacked.
    446            if (mXMLElemEmpty.lookingAt(fPos, status)) {
    447                UXMLElement *t = createElement(mXMLElemEmpty, status);
    448                el->fChildren.addElement(t, status);
    449                continue;
    450            }
    451 
    452            // Hit something within the document that doesn't match anything.
    453            //   It's an error.
    454            error("Unrecognized markup", status);
    455            break;
    456        }
    457 
    458        if (el != nullptr || !fElementStack.empty()) {
    459            // We bailed out early, for some reason.
    460            error("Root element not closed.", status);
    461            goto errorExit;
    462        }
    463    }
    464 
    465    // Root Element parse is complete.
    466    // Consume the annoying xml "Misc" that can appear at the end of the doc.
    467    parseMisc(status);
    468 
    469    // We should have reached the end of the input
    470    if (fPos != src.length()) {
    471        error("Extra content at the end of the document", status);
    472        goto errorExit;
    473    }
    474 
    475    // Success!
    476    return root;
    477 
    478 errorExit:
    479    delete root;
    480    return nullptr;
    481 }
    482 
    483 //
    484 //  createElement
    485 //      We've just matched an element start tag.  Create and fill in a UXMLElement object
    486 //      for it.
    487 //
    488 UXMLElement *
    489 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
    490    // First capture group is the element's name.
    491    UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
    492 
    493    // Scan for attributes.
    494    int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
    495 
    496    while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
    497        UnicodeString attName  = mAttrValue.group(1, status);
    498        UnicodeString attValue = mAttrValue.group(2, status);
    499 
    500        // Trim the quotes from the att value.  These are left over from the original regex
    501        //   that parsed the attribute, which couldn't conveniently strip them.
    502        attValue.remove(0,1);                    // one char from the beginning
    503        attValue.truncate(attValue.length()-1);  // and one from the end.
    504        
    505        // XML Attribute value normalization. 
    506        // This is one of the really screwy parts of the XML spec.
    507        // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
    508        // Note that non-validating parsers must treat all entities as type CDATA
    509        //   which simplifies things some.
    510 
    511        // Att normalization step 1:  normalize any newlines in the attribute value
    512        mNewLineNormalizer.reset(attValue);
    513        attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
    514 
    515        // Next change all xml white space chars to plain \u0020 spaces.
    516        mAttrNormalizer.reset(attValue);
    517        UnicodeString oneSpace(static_cast<char16_t>(0x0020));
    518        attValue = mAttrNormalizer.replaceAll(oneSpace, status);
    519 
    520        // Replace character entities.
    521        replaceCharRefs(attValue, status);
    522 
    523        // Save the attribute name and value in our document structure.
    524        el->fAttNames.addElement((void *)intern(attName, status), status);
    525        el->fAttValues.addElement(attValue.clone(), status);
    526        pos = mAttrValue.end(2, status);
    527    }
    528    fPos = mEl.end(0, status);
    529    return el;
    530 }
    531 
    532 //
    533 //  parseMisc
    534 //     Consume XML "Misc" [production #27]
    535 //        which is any combination of space, PI and comments
    536 //      Need to watch end-of-input because xml MISC stuff is allowed after
    537 //        the document element, so we WILL scan off the end in this function
    538 //
    539 void
    540 UXMLParser::parseMisc(UErrorCode &status)  {
    541    for (;;) {
    542        if (fPos >= mXMLPI.input().length()) {
    543            break;
    544        }
    545        if (mXMLPI.lookingAt(fPos, status)) {
    546            fPos = mXMLPI.end(status);
    547            continue;
    548        }
    549        if (mXMLSP.lookingAt(fPos, status)) {
    550            fPos = mXMLSP.end(status);
    551            continue;
    552        }
    553        if (mXMLComment.lookingAt(fPos, status)) {
    554            fPos = mXMLComment.end(status);
    555            continue;
    556        }
    557        break;
    558    }
    559 }
    560 
    561 //
    562 //  Scan for document content.
    563 //
    564 UnicodeString
    565 UXMLParser::scanContent(UErrorCode &status) {
    566    UnicodeString  result;
    567    if (mXMLCharData.lookingAt(fPos, status)) {
    568        result = mXMLCharData.group(static_cast<int32_t>(0), status);
    569        // Normalize the new-lines.  (Before char ref substitution)
    570        mNewLineNormalizer.reset(result);
    571        result = mNewLineNormalizer.replaceAll(fOneLF, status);
    572        
    573        // TODO:  handle CDATA
    574        fPos = mXMLCharData.end(0, status);
    575    }
    576 
    577    return result;
    578 }
    579 
    580 //
    581 //   replaceCharRefs
    582 //
    583 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
    584 //       with the corresponding actual character.
    585 //
    586 void
    587 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
    588    UnicodeString result;
    589    UnicodeString replacement;
    590    int     i;
    591 
    592    mAmps.reset(s);
    593    // See the initialization for the regex matcher mAmps.
    594    //    Which entity we've matched is determined by which capture group has content,
    595    //      which is flagged by start() of that group not being -1.
    596    while (mAmps.find()) {
    597        if (mAmps.start(1, status) != -1) {
    598            replacement.setTo(static_cast<char16_t>(x_AMP));
    599        } else if (mAmps.start(2, status) != -1) {
    600            replacement.setTo(static_cast<char16_t>(x_LT));
    601        } else if (mAmps.start(3, status) != -1) {
    602            replacement.setTo(static_cast<char16_t>(x_GT));
    603        } else if (mAmps.start(4, status) != -1) {
    604            replacement.setTo(static_cast<char16_t>(x_APOS));
    605        } else if (mAmps.start(5, status) != -1) {
    606            replacement.setTo(static_cast<char16_t>(x_QUOT));
    607        } else if (mAmps.start(6, status) != -1) {
    608            UnicodeString hexString = mAmps.group(6, status);
    609            UChar32 val = 0;
    610            for (i=0; i<hexString.length(); i++) {
    611                val = (val << 4) + u_digit(hexString.charAt(i), 16);
    612            }
    613            // TODO:  some verification that the character is valid
    614            replacement.setTo(val);
    615        } else if (mAmps.start(7, status) != -1) {
    616            UnicodeString decimalString = mAmps.group(7, status);
    617            UChar32 val = 0;
    618            for (i=0; i<decimalString.length(); i++) {
    619                val = val*10 + u_digit(decimalString.charAt(i), 10);
    620            }
    621            // TODO:  some verification that the character is valid
    622            replacement.setTo(val);
    623        } else {
    624            // An unrecognized &entity;  Leave it alone.
    625            //  TODO:  check that it really looks like an entity, and is not some
    626            //         random & in the text.
    627            replacement = mAmps.group(static_cast<int32_t>(0), status);
    628        }
    629        mAmps.appendReplacement(result, replacement, status);
    630    }
    631    mAmps.appendTail(result);
    632    s = result;
    633 }
    634 
    635 void
    636 UXMLParser::error(const char *message, UErrorCode &status) {
    637    // TODO:  something better here...
    638    const UnicodeString &src=mXMLDecl.input();
    639    int  line = 0;
    640    int  ci = 0;
    641    while (ci < fPos && ci>=0) {
    642        ci = src.indexOf(static_cast<char16_t>(0x0a), ci + 1);
    643        line++;
    644    }
    645    fprintf(stderr, "Error: %s at line %d\n", message, line);
    646    if (U_SUCCESS(status)) {
    647        status = U_PARSE_ERROR;
    648    }
    649 }
    650 
    651 // intern strings like in Java
    652 
    653 const UnicodeString *
    654 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
    655    const UHashElement *he=fNames.find(s);
    656    if(he!=nullptr) {
    657        // already a known name, return its hashed key pointer
    658        return static_cast<const UnicodeString*>(he->key.pointer);
    659    } else {
    660        // add this new name and return its hashed key pointer
    661        fNames.puti(s, 1, errorCode);
    662        he=fNames.find(s);
    663        return static_cast<const UnicodeString*>(he->key.pointer);
    664    }
    665 }
    666 
    667 const UnicodeString *
    668 UXMLParser::findName(const UnicodeString &s) const {
    669    const UHashElement *he=fNames.find(s);
    670    if(he!=nullptr) {
    671        // a known name, return its hashed key pointer
    672        return static_cast<const UnicodeString*>(he->key.pointer);
    673    } else {
    674        // unknown name
    675        return nullptr;
    676    }
    677 }
    678 
    679 // UXMLElement ------------------------------------------------------------- ***
    680 
    681 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
    682   fParser(parser),
    683   fName(name),
    684   fAttNames(errorCode),
    685   fAttValues(errorCode),
    686   fChildren(errorCode),
    687   fParent(nullptr)
    688 {
    689 }
    690 
    691 UXMLElement::~UXMLElement() {
    692    int   i;
    693    // attribute names are owned by the UXMLParser, don't delete them here
    694    for (i=fAttValues.size()-1; i>=0; i--) {
    695        delete static_cast<UObject*>(fAttValues.elementAt(i));
    696    }
    697    for (i=fChildren.size()-1; i>=0; i--) {
    698        delete static_cast<UObject*>(fChildren.elementAt(i));
    699    }
    700 }
    701 
    702 const UnicodeString &
    703 UXMLElement::getTagName() const {
    704    return *fName;
    705 }
    706 
    707 UnicodeString
    708 UXMLElement::getText(UBool recurse) const {
    709    UnicodeString text;
    710    appendText(text, recurse);
    711    return text;
    712 }
    713 
    714 void
    715 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
    716    const UObject *node;
    717    int32_t i, count=fChildren.size();
    718    for(i=0; i<count; ++i) {
    719        node = static_cast<const UObject*>(fChildren.elementAt(i));
    720        const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
    721        if(s!=nullptr) {
    722            text.append(*s);
    723        } else if(recurse) /* must be a UXMLElement */ {
    724            ((const UXMLElement *)node)->appendText(text, recurse);
    725        }
    726    }
    727 }
    728 
    729 int32_t
    730 UXMLElement::countAttributes() const {
    731    return fAttNames.size();
    732 }
    733 
    734 const UnicodeString *
    735 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
    736    if(0<=i && i<fAttNames.size()) {
    737        name.setTo(*static_cast<const UnicodeString*>(fAttNames.elementAt(i)));
    738        value.setTo(*static_cast<const UnicodeString*>(fAttValues.elementAt(i)));
    739        return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
    740    } else {
    741        return nullptr;
    742    }
    743 }
    744 
    745 const UnicodeString *
    746 UXMLElement::getAttribute(const UnicodeString &name) const {
    747    // search for the attribute name by comparing the interned pointer,
    748    // not the string contents
    749    const UnicodeString *p=fParser->findName(name);
    750    if(p==nullptr) {
    751        return nullptr; // no such attribute seen by the parser at all
    752    }
    753 
    754    int32_t i, count=fAttNames.size();
    755    for(i=0; i<count; ++i) {
    756        if (p == static_cast<const UnicodeString*>(fAttNames.elementAt(i))) {
    757            return static_cast<const UnicodeString*>(fAttValues.elementAt(i));
    758        }
    759    }
    760    return nullptr;
    761 }
    762 
    763 int32_t
    764 UXMLElement::countChildren() const {
    765    return fChildren.size();
    766 }
    767 
    768 const UObject *
    769 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
    770    if(0<=i && i<fChildren.size()) {
    771        const UObject* node = static_cast<const UObject*>(fChildren.elementAt(i));
    772        if(dynamic_cast<const UXMLElement *>(node)!=nullptr) {
    773            type=UXML_NODE_TYPE_ELEMENT;
    774        } else {
    775            type=UXML_NODE_TYPE_STRING;
    776        }
    777        return node;
    778    } else {
    779        return nullptr;
    780    }
    781 }
    782 
    783 const UXMLElement *
    784 UXMLElement::nextChildElement(int32_t &i) const {
    785    if(i<0) {
    786        return nullptr;
    787    }
    788 
    789    const UObject *node;
    790    int32_t count=fChildren.size();
    791    while(i<count) {
    792        node = static_cast<const UObject*>(fChildren.elementAt(i++));
    793        const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
    794        if(elem!=nullptr) {
    795            return elem;
    796        }
    797    }
    798    return nullptr;
    799 }
    800 
    801 const UXMLElement *
    802 UXMLElement::getChildElement(const UnicodeString &name) const {
    803    // search for the element name by comparing the interned pointer,
    804    // not the string contents
    805    const UnicodeString *p=fParser->findName(name);
    806    if(p==nullptr) {
    807        return nullptr; // no such element seen by the parser at all
    808    }
    809 
    810    const UObject *node;
    811    int32_t i, count=fChildren.size();
    812    for(i=0; i<count; ++i) {
    813        node = static_cast<const UObject*>(fChildren.elementAt(i));
    814        const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
    815        if(elem!=nullptr) {
    816            if(p==elem->fName) {
    817                return elem;
    818            }
    819        }
    820    }
    821    return nullptr;
    822 }
    823 
    824 U_NAMESPACE_END
    825 
    826 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */