parse.py (8239B)
#!/usr/bin/env python
"""
Parse a document to a tree, with optional profiling
"""

import argparse
import sys
import traceback

from html5lib import html5parser
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants
from html5lib import _utils


def _open_input(filename):
    """Open the input named on the command line.

    ``filename`` may be an ``http://``/``https://`` URL, ``-`` for standard
    input, or a path on the local file system.

    Returns a ``(stream, encoding)`` tuple, where ``encoding`` is the
    override encoding to hand to the parser (``None`` means "no override").
    Writes a message to stderr and exits with status 1 when the input
    cannot be opened.
    """
    encoding = "utf8"

    if filename.startswith(("http://", "https://")):
        import urllib.request
        try:
            stream = urllib.request.urlopen(filename)
        except Exception as e:
            # Report the failure instead of silently falling through and
            # parsing the URL string itself as if it were the document.
            sys.stderr.write("Unable to open URL: %s\n" % e)
            sys.exit(1)
        # Only trust the server when it actually sent a Content-Type; a
        # Content-Type without a charset yields None (no override).
        if stream.headers.get('content-type'):
            encoding = stream.headers.get_content_charset()
        return stream, encoding

    if filename == '-':
        if sys.version_info[0] >= 3:
            encoding = None
        return sys.stdin, encoding

    try:
        return open(filename, "rb"), encoding
    except IOError as e:
        sys.stderr.write("Unable to open file: %s\n" % e)
        sys.exit(1)


def parse():
    """Command-line entry point.

    Reads the options from ``sys.argv``, opens the requested input, parses
    it with the selected tree builder and then, depending on the options,
    profiles the run, times it, or simply prints the result.
    """
    parser = get_parser()
    opts = parser.parse_args()

    f, encoding = _open_input(opts.filename)
    try:
        treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

        p = html5parser.HTMLParser(tree=treebuilder, debug=opts.log)

        if opts.fragment:
            parseMethod = p.parseFragment
        else:
            parseMethod = p.parse

        if opts.profile:
            import cProfile
            import pstats
            cProfile.runctx("run(parseMethod, f, encoding, scripting)", None,
                            {"run": run,
                             "parseMethod": parseMethod,
                             "f": f,
                             "encoding": encoding,
                             "scripting": opts.scripting},
                            "stats.prof")
            # XXX - We should use a temp file here
            stats = pstats.Stats('stats.prof')
            stats.strip_dirs()
            stats.sort_stats('time')
            stats.print_stats()
        elif opts.time:
            import time
            t0 = time.time()
            document = run(parseMethod, f, encoding, opts.scripting)
            t1 = time.time()
            # run() returns None on failure; compare against None rather
            # than relying on truthiness, which is False for an element
            # without children.
            if document is not None:
                printOutput(p, document, opts)
                t2 = time.time()
                sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
            else:
                sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
        else:
            document = run(parseMethod, f, encoding, opts.scripting)
            if document is not None:
                printOutput(p, document, opts)
    finally:
        # Release the file or HTTP response; stdin is not ours to close.
        if f is not sys.stdin:
            f.close()


def run(parseMethod, f, encoding, scripting):
    """Call ``parseMethod`` on ``f`` and return the resulting document.

    ``encoding`` is passed as ``override_encoding`` and ``scripting`` as
    ``scripting``.  Any exception raised while parsing is printed with its
    traceback and ``None`` is returned instead of propagating it.
    """
    try:
        document = parseMethod(f, override_encoding=encoding, scripting=scripting)
    except Exception:
        document = None
        traceback.print_exc()
    return document


def printOutput(parser, document, opts):
    """Print the parse result to stdout as selected by ``opts``.

    Prints, in order: the detected character encoding (``opts.encoding``),
    every entry of ``parser.log``, the document as XML, debug tree or
    serialized HTML (first of ``opts.xml``, ``opts.tree``, ``opts.html``
    that is set), and finally the list of parse errors (``opts.error``).
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == "dom":
                document.writexml(sys.stdout, encoding="utf-8")
            elif tb == "lxml":
                import lxml.etree
                sys.stdout.write(lxml.etree.tostring(document, encoding="unicode"))
            elif tb == "etree":
                sys.stdout.write(_utils.default_etree.tostring(document, encoding="unicode"))
        elif opts.tree:
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.html:
            # Forward every serializer option that has a matching
            # command-line flag.
            # NOTE(review): the flags are booleans; confirm the installed
            # html5lib serializer still accepts a boolean for
            # quote_attr_values rather than a string mode.
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                if hasattr(opts, opt):
                    kwargs[opt] = getattr(opts, opt)
            # An unset/empty quote character must not override the
            # serializer's own default.
            if not kwargs.get('quote_char'):
                kwargs.pop('quote_char', None)

            if opts.sanitize:
                kwargs["sanitize"] = True

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            # Pre-bind so the trailing-newline check below also works when
            # the serializer yields nothing at all.
            text = ""
            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        errList = [
            "Line %i Col %i" % pos + " " +
            constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars
            for pos, errorcode, datavars in parser.errors
        ]
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")


def get_parser():
    """Build and return the ``argparse.ArgumentParser`` for this script."""
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument("-p", "--profile", action="store_true",
                        help="Use the cProfile profiler to "
                        "produce a detailed log of the run")

    parser.add_argument("-t", "--time",
                        action="store_true",
                        help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")

    parser.add_argument("-b", "--treebuilder",
                        default="etree")

    parser.add_argument("-e", "--error", action="store_true",
                        help="Print a list of parse errors")

    parser.add_argument("-f", "--fragment", action="store_true",
                        help="Parse as a fragment")

    parser.add_argument("-s", "--scripting", action="store_true",
                        help="Handle noscript tags as if scripting was enabled")

    parser.add_argument("--tree", action="store_true",
                        help="Output as debug tree")

    parser.add_argument("-x", "--xml", action="store_true",
                        help="Output as xml")

    parser.add_argument("--no-html", action="store_false",
                        dest="html", help="Don't output html")

    parser.add_argument("-c", "--encoding", action="store_true",
                        help="Print character encoding used")

    parser.add_argument("--inject-meta-charset", action="store_true",
                        help="inject <meta charset>")

    parser.add_argument("--strip-whitespace", action="store_true",
                        help="strip whitespace")

    parser.add_argument("--omit-optional-tags", action="store_true",
                        help="omit optional tags")

    parser.add_argument("--quote-attr-values", action="store_true",
                        help="quote attribute values")

    parser.add_argument("--use-best-quote-char", action="store_true",
                        help="use best quote character")

    parser.add_argument("--quote-char",
                        help="quote character")

    parser.add_argument("--no-minimize-boolean-attributes",
                        action="store_false",
                        dest="minimize_boolean_attributes",
                        help="don't minimize boolean attributes")

    parser.add_argument("--use-trailing-solidus", action="store_true",
                        help="use trailing solidus")

    parser.add_argument("--space-before-trailing-solidus",
                        action="store_true",
                        help="add space before trailing solidus")

    parser.add_argument("--escape-lt-in-attrs", action="store_true",
                        help="escape less than signs in attribute values")

    parser.add_argument("--escape-rcdata", action="store_true",
                        help="escape rcdata element values")

    parser.add_argument("--sanitize", action="store_true",
                        help="sanitize")

    parser.add_argument("-l", "--log", action="store_true",
                        help="log state transitions")

    parser.add_argument("filename")

    return parser


if __name__ == "__main__":
    parse()