tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

XMLParser.py (5030B)


      1 from collections import OrderedDict
      2 from typing import Dict, List, Optional, Text, Union
      3 from os.path import dirname, join
      4 from xml.parsers import expat
      5 import xml.etree.ElementTree as etree  # noqa: N813
      6 
      7 
# Path of the "catalog" directory located next to this module file;
# XMLParser._external opens "xhtml.dtd" from this directory.
_catalog = join(dirname(__file__), "catalog")
      9 
     10 def _wrap_error(e: expat.error) -> etree.ParseError:
     11    err = etree.ParseError(e)
     12    err.code = e.code
     13    err.position = e.lineno, e.offset
     14    raise err
     15 
     16 _names: Dict[Text, Text] = {}
     17 def _fixname(key: Text) -> Text:
     18    try:
     19        name = _names[key]
     20    except KeyError:
     21        name = key
     22        if "}" in name:
     23            name = "{" + name
     24        _names[key] = name
     25    return name
     26 
     27 
# Numeric expat error code looked up for XML_ERROR_UNDEFINED_ENTITY;
# XMLParser._skipped attaches it to the error it raises for skipped entities.
_undefined_entity_code: int = expat.errors.codes[expat.errors.XML_ERROR_UNDEFINED_ENTITY]
     29 
     30 
     31 class XMLParser:
     32    """
     33    An XML parser with support for XHTML DTDs and all Python-supported encodings
     34 
     35    This implements the API defined by
     36    xml.etree.ElementTree.XMLParser, but supports XHTML DTDs
     37    (therefore allowing XHTML entities) and supports all encodings
     38    Python does, rather than just those supported by expat.
     39    """
     40    def __init__(self, encoding: Optional[Text] = None) -> None:
     41        self._parser = expat.ParserCreate(encoding, "}")
     42        self._target = etree.TreeBuilder()
     43        # parser settings
     44        self._parser.buffer_text = True
     45        self._parser.ordered_attributes = True
     46        self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
     47        # parser callbacks
     48        self._parser.XmlDeclHandler = self._xml_decl
     49        self._parser.StartElementHandler = self._start
     50        self._parser.EndElementHandler = self._end
     51        self._parser.CharacterDataHandler = self._data
     52        self._parser.ExternalEntityRefHandler = self._external
     53        self._parser.SkippedEntityHandler = self._skipped
     54        # used for our horrible re-encoding hack
     55        self._fed_data: Optional[List[bytes]] = []
     56        self._read_encoding: Optional[Text] = None
     57 
     58    def _xml_decl(self, version: Text, encoding: Optional[Text], standalone: int) -> None:
     59        self._read_encoding = encoding
     60 
     61    def _start(self, tag: Text, attrib_in: List[str]) -> etree.Element:
     62        assert isinstance(tag, str)
     63        self._fed_data = None
     64        tag = _fixname(tag)
     65        attrib: Dict[Union[bytes, Text], Union[bytes, Text]] = OrderedDict()
     66        if attrib_in:
     67            for i in range(0, len(attrib_in), 2):
     68                attrib[_fixname(attrib_in[i])] = attrib_in[i+1]
     69        return self._target.start(tag, attrib)
     70 
     71    def _data(self, text: Text) -> None:
     72        self._target.data(text)
     73 
     74    def _end(self, tag: Text) -> etree.Element:
     75        return self._target.end(_fixname(tag))
     76 
     77    def _external(self, context: Text, base: Optional[Text], system_id: Optional[Text], public_id: Optional[Text]) -> bool:
     78        if public_id in {
     79                "-//W3C//DTD XHTML 1.0 Transitional//EN",
     80                "-//W3C//DTD XHTML 1.1//EN",
     81                "-//W3C//DTD XHTML 1.0 Strict//EN",
     82                "-//W3C//DTD XHTML 1.0 Frameset//EN",
     83                "-//W3C//DTD XHTML Basic 1.0//EN",
     84                "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN",
     85                "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN",
     86                "-//W3C//DTD MathML 2.0//EN",
     87                "-//WAPFORUM//DTD XHTML Mobile 1.0//EN"
     88        }:
     89            parser = self._parser.ExternalEntityParserCreate(context)
     90            with open(join(_catalog, "xhtml.dtd"), "rb") as fp:
     91                try:
     92                    parser.ParseFile(fp)
     93                except expat.error:
     94                    return False
     95 
     96        return True
     97 
     98    def _skipped(self, name: Text, is_parameter_entity: bool) -> None:
     99        err = expat.error("undefined entity %s: line %d, column %d" %
    100                          (name, self._parser.ErrorLineNumber,
    101                           self._parser.ErrorColumnNumber))
    102        err.code = _undefined_entity_code
    103        err.lineno = self._parser.ErrorLineNumber
    104        err.offset = self._parser.ErrorColumnNumber
    105        raise err
    106 
    107    def feed(self, data: bytes) -> None:
    108        if self._fed_data is not None:
    109            self._fed_data.append(data)
    110        try:
    111            self._parser.Parse(data, False)
    112        except expat.error as v:
    113            _wrap_error(v)
    114        except ValueError as e:
    115            if e.args[0] == 'multi-byte encodings are not supported':
    116                assert self._read_encoding is not None
    117                assert self._fed_data is not None
    118                xml = b"".join(self._fed_data).decode(self._read_encoding).encode("utf-8")
    119                new_parser = XMLParser("utf-8")
    120                self._parser = new_parser._parser
    121                self._target = new_parser._target
    122                self._fed_data = None
    123                self.feed(xml)
    124 
    125    def close(self) -> etree.Element:
    126        try:
    127            self._parser.Parse("", True)
    128        except expat.error as v:
    129            _wrap_error(v)
    130        tree = self._target.close()
    131        return tree