XMLParser.py (5030B)
from collections import OrderedDict
from typing import Dict, List, NoReturn, Optional, Text, Union
from os.path import dirname, join
from xml.parsers import expat
import xml.etree.ElementTree as etree  # noqa: N813


# Directory (next to this module) that holds the bundled DTD file(s).
_catalog = join(dirname(__file__), "catalog")

# Public identifiers for which the bundled "xhtml.dtd" is loaded by
# XMLParser._external. Built once here rather than on every callback.
_XHTML_PUBLIC_IDS = frozenset({
    "-//W3C//DTD XHTML 1.0 Transitional//EN",
    "-//W3C//DTD XHTML 1.1//EN",
    "-//W3C//DTD XHTML 1.0 Strict//EN",
    "-//W3C//DTD XHTML 1.0 Frameset//EN",
    "-//W3C//DTD XHTML Basic 1.0//EN",
    "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN",
    "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN",
    "-//W3C//DTD MathML 2.0//EN",
    "-//WAPFORUM//DTD XHTML Mobile 1.0//EN",
})


def _wrap_error(e: expat.error) -> NoReturn:
    """Re-raise an expat error as an ``etree.ParseError``.

    The ``code`` of the expat error is copied over and its ``lineno`` /
    ``offset`` are stored as the ``position`` tuple. This function never
    returns normally.

    :param e: the expat error to convert
    :raises etree.ParseError: always
    """
    err = etree.ParseError(e)
    err.code = e.code
    err.position = e.lineno, e.offset
    raise err


# Cache of raw expat names -> fixed-up names, shared by all parser instances.
_names: Dict[Text, Text] = {}


def _fixname(key: Text) -> Text:
    """Return *key* with a leading ``{`` added when it contains ``}``.

    The parser is created with ``}`` as its namespace separator, so a name
    containing ``}`` becomes ``{uri}local`` once the brace is prepended.
    Results are memoised in the module-level ``_names`` dict.

    :param key: element or attribute name as reported by expat
    :returns: the fixed-up name
    """
    try:
        name = _names[key]
    except KeyError:
        name = key
        if "}" in name:
            name = "{" + name
        _names[key] = name
    return name


# Numeric expat error code used when reporting an undefined entity.
_undefined_entity_code: int = expat.errors.codes[expat.errors.XML_ERROR_UNDEFINED_ENTITY]


class XMLParser:
    """
    An XML parser with support for XHTML DTDs and all Python-supported encodings

    This implements the API defined by
    xml.etree.ElementTree.XMLParser, but supports XHTML DTDs
    (therefore allowing XHTML entities) and supports all encodings
    Python does, rather than just those supported by expat.
    """
    def __init__(self, encoding: Optional[Text] = None) -> None:
        """Create the underlying expat parser and tree builder.

        :param encoding: encoding passed to ``expat.ParserCreate``; ``None``
            lets expat decide
        """
        self._parser = expat.ParserCreate(encoding, "}")
        self._target = etree.TreeBuilder()
        # parser settings
        self._parser.buffer_text = True
        self._parser.ordered_attributes = True
        self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)
        # parser callbacks
        self._parser.XmlDeclHandler = self._xml_decl
        self._parser.StartElementHandler = self._start
        self._parser.EndElementHandler = self._end
        self._parser.CharacterDataHandler = self._data
        self._parser.ExternalEntityRefHandler = self._external
        self._parser.SkippedEntityHandler = self._skipped
        # used for our horrible re-encoding hack: every chunk passed to
        # feed() is kept here until the first start tag is seen, so the
        # input can be replayed as UTF-8 if expat rejects the encoding.
        self._fed_data: Optional[List[bytes]] = []
        self._read_encoding: Optional[Text] = None

    def _xml_decl(self, version: Text, encoding: Optional[Text], standalone: int) -> None:
        """Record the encoding named in the XML declaration."""
        self._read_encoding = encoding

    def _start(self, tag: Text, attrib_in: List[str]) -> etree.Element:
        """Handle a start tag and forward it to the tree builder.

        :param tag: element name as reported by expat
        :param attrib_in: flat ``[name, value, name, value, ...]`` list
            (``ordered_attributes`` is enabled on the parser)
        :returns: whatever the tree builder's ``start`` returns
        """
        assert isinstance(tag, str)
        # Once an element has started, the buffered input is no longer kept.
        self._fed_data = None
        tag = _fixname(tag)
        attrib: Dict[Union[bytes, Text], Union[bytes, Text]] = OrderedDict()
        if attrib_in:
            # Pair up the flat name/value list, preserving document order.
            for name, value in zip(attrib_in[::2], attrib_in[1::2]):
                attrib[_fixname(name)] = value
        return self._target.start(tag, attrib)

    def _data(self, text: Text) -> None:
        """Forward character data to the tree builder."""
        self._target.data(text)

    def _end(self, tag: Text) -> etree.Element:
        """Handle an end tag and forward it to the tree builder."""
        return self._target.end(_fixname(tag))

    def _external(self, context: Text, base: Optional[Text], system_id: Optional[Text], public_id: Optional[Text]) -> bool:
        """Handle an external entity reference.

        When *public_id* is one of the known XHTML/MathML identifiers, the
        bundled ``xhtml.dtd`` from the catalog directory is parsed with a
        sub-parser created for *context*. Any other reference is not loaded.

        :returns: ``False`` if parsing the bundled DTD raised ``expat.error``,
            ``True`` otherwise
        """
        if public_id in _XHTML_PUBLIC_IDS:
            parser = self._parser.ExternalEntityParserCreate(context)
            with open(join(_catalog, "xhtml.dtd"), "rb") as fp:
                try:
                    parser.ParseFile(fp)
                except expat.error:
                    return False

        return True

    def _skipped(self, name: Text, is_parameter_entity: bool) -> None:
        """Report a skipped entity as an undefined-entity error.

        :raises expat.error: always, carrying the undefined-entity code and
            the parser's current error line and column
        """
        err = expat.error("undefined entity %s: line %d, column %d" %
                          (name, self._parser.ErrorLineNumber,
                           self._parser.ErrorColumnNumber))
        err.code = _undefined_entity_code
        err.lineno = self._parser.ErrorLineNumber
        err.offset = self._parser.ErrorColumnNumber
        raise err

    def feed(self, data: bytes) -> None:
        """Feed a chunk of the document to the parser.

        If expat rejects the document's encoding as multi-byte, everything
        fed so far is decoded with the encoding from the XML declaration,
        re-encoded as UTF-8 and replayed through a fresh UTF-8 parser.

        :param data: next chunk of the encoded document
        :raises etree.ParseError: if expat reports a parse error
        :raises ValueError: any ``ValueError`` from the parser other than
            the multi-byte-encoding one handled here
        """
        if self._fed_data is not None:
            self._fed_data.append(data)
        try:
            self._parser.Parse(data, False)
        except expat.error as v:
            _wrap_error(v)
        except ValueError as e:
            if e.args[0] == 'multi-byte encodings are not supported':
                # Internal invariants of the re-encoding hack (kept as
                # asserts; they also narrow the Optional types).
                assert self._read_encoding is not None
                assert self._fed_data is not None
                xml = b"".join(self._fed_data).decode(self._read_encoding).encode("utf-8")
                # Adopt a fresh UTF-8 parser and its tree builder, then
                # replay the transcoded input through it.
                new_parser = XMLParser("utf-8")
                self._parser = new_parser._parser
                self._target = new_parser._target
                self._fed_data = None
                self.feed(xml)
            else:
                # Not the case handled above: do not swallow it.
                raise

    def close(self) -> etree.Element:
        """Finish parsing and return the tree builder's result.

        :returns: the value returned by the tree builder's ``close``
        :raises etree.ParseError: if expat reports an error while finishing
        """
        try:
            self._parser.Parse("", True)
        except expat.error as v:
            _wrap_error(v)
        tree = self._target.close()
        return tree