tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

parse.py (8239B)


      1 #!/usr/bin/env python
      2 """
      3 Parse a document to a tree, with optional profiling
      4 """
      5 
      6 import argparse
      7 import sys
      8 import traceback
      9 
     10 from html5lib import html5parser
     11 from html5lib import treebuilders, serializer, treewalkers
     12 from html5lib import constants
     13 from html5lib import _utils
     14 
     15 
     16 def parse():
     17    parser = get_parser()
     18    opts = parser.parse_args()
     19    encoding = "utf8"
     20 
     21    try:
     22        f = opts.filename
     23        # Try opening from the internet
     24        if f.startswith('http://'):
     25            try:
     26                import urllib.request
     27                import urllib.parse
     28                import urllib.error
     29                import cgi
     30                f = urllib.request.urlopen(f)
     31                contentType = f.headers.get('content-type')
     32                if contentType:
     33                    (mediaType, params) = cgi.parse_header(contentType)
     34                    encoding = params.get('charset')
     35            except Exception:
     36                pass
     37        elif f == '-':
     38            f = sys.stdin
     39            if sys.version_info[0] >= 3:
     40                encoding = None
     41        else:
     42            try:
     43                # Try opening from file system
     44                f = open(f, "rb")
     45            except IOError as e:
     46                sys.stderr.write("Unable to open file: %s\n" % e)
     47                sys.exit(1)
     48    except IndexError:
     49        sys.stderr.write("No filename provided. Use -h for help\n")
     50        sys.exit(1)
     51 
     52    treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
     53 
     54    p = html5parser.HTMLParser(tree=treebuilder, debug=opts.log)
     55 
     56    if opts.fragment:
     57        parseMethod = p.parseFragment
     58    else:
     59        parseMethod = p.parse
     60 
     61    if opts.profile:
     62        import cProfile
     63        import pstats
     64        cProfile.runctx("run(parseMethod, f, encoding, scripting)", None,
     65                        {"run": run,
     66                         "parseMethod": parseMethod,
     67                         "f": f,
     68                         "encoding": encoding,
     69                         "scripting": opts.scripting},
     70                        "stats.prof")
     71        # XXX - We should use a temp file here
     72        stats = pstats.Stats('stats.prof')
     73        stats.strip_dirs()
     74        stats.sort_stats('time')
     75        stats.print_stats()
     76    elif opts.time:
     77        import time
     78        t0 = time.time()
     79        document = run(parseMethod, f, encoding, opts.scripting)
     80        t1 = time.time()
     81        if document:
     82            printOutput(p, document, opts)
     83            t2 = time.time()
     84            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
     85        else:
     86            sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
     87    else:
     88        document = run(parseMethod, f, encoding, opts.scripting)
     89        if document:
     90            printOutput(p, document, opts)
     91 
     92 
     93 def run(parseMethod, f, encoding, scripting):
     94    try:
     95        document = parseMethod(f, override_encoding=encoding, scripting=scripting)
     96    except Exception:
     97        document = None
     98        traceback.print_exc()
     99    return document
    100 
    101 
    102 def printOutput(parser, document, opts):
    103    if opts.encoding:
    104        print("Encoding:", parser.tokenizer.stream.charEncoding)
    105 
    106    for item in parser.log:
    107        print(item)
    108 
    109    if document is not None:
    110        if opts.xml:
    111            tb = opts.treebuilder.lower()
    112            if tb == "dom":
    113                document.writexml(sys.stdout, encoding="utf-8")
    114            elif tb == "lxml":
    115                import lxml.etree
    116                sys.stdout.write(lxml.etree.tostring(document, encoding="unicode"))
    117            elif tb == "etree":
    118                sys.stdout.write(_utils.default_etree.tostring(document, encoding="unicode"))
    119        elif opts.tree:
    120            if not hasattr(document, '__getitem__'):
    121                document = [document]
    122            for fragment in document:
    123                print(parser.tree.testSerializer(fragment))
    124        elif opts.html:
    125            kwargs = {}
    126            for opt in serializer.HTMLSerializer.options:
    127                try:
    128                    kwargs[opt] = getattr(opts, opt)
    129                except Exception:
    130                    pass
    131            if not kwargs['quote_char']:
    132                del kwargs['quote_char']
    133 
    134            if opts.sanitize:
    135                kwargs["sanitize"] = True
    136 
    137            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
    138            if sys.version_info[0] >= 3:
    139                encoding = None
    140            else:
    141                encoding = "utf-8"
    142            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
    143                sys.stdout.write(text)
    144            if not text.endswith('\n'):
    145                sys.stdout.write('\n')
    146    if opts.error:
    147        errList = []
    148        for pos, errorcode, datavars in parser.errors:
    149            errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
    150        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
    151 
    152 
    153 def get_parser():
    154    parser = argparse.ArgumentParser(description=__doc__)
    155 
    156    parser.add_argument("-p", "--profile", action="store_true",
    157                        help="Use the hotshot profiler to "
    158                        "produce a detailed log of the run")
    159 
    160    parser.add_argument("-t", "--time",
    161                        action="store_true",
    162                        help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")
    163 
    164    parser.add_argument("-b", "--treebuilder",
    165                        default="etree")
    166 
    167    parser.add_argument("-e", "--error", action="store_true",
    168                        help="Print a list of parse errors")
    169 
    170    parser.add_argument("-f", "--fragment", action="store_true",
    171                        help="Parse as a fragment")
    172 
    173    parser.add_argument("-s", "--scripting", action="store_true",
    174                        help="Handle noscript tags as if scripting was enabled")
    175 
    176    parser.add_argument("--tree", action="store_true",
    177                        help="Output as debug tree")
    178 
    179    parser.add_argument("-x", "--xml", action="store_true",
    180                        help="Output as xml")
    181 
    182    parser.add_argument("--no-html", action="store_false",
    183                        dest="html", help="Don't output html")
    184 
    185    parser.add_argument("-c", "--encoding", action="store_true",
    186                        help="Print character encoding used")
    187 
    188    parser.add_argument("--inject-meta-charset", action="store_true",
    189                        help="inject <meta charset>")
    190 
    191    parser.add_argument("--strip-whitespace", action="store_true",
    192                        help="strip whitespace")
    193 
    194    parser.add_argument("--omit-optional-tags", action="store_true",
    195                        help="omit optional tags")
    196 
    197    parser.add_argument("--quote-attr-values", action="store_true",
    198                        help="quote attribute values")
    199 
    200    parser.add_argument("--use-best-quote-char", action="store_true",
    201                        help="use best quote character")
    202 
    203    parser.add_argument("--quote-char",
    204                        help="quote character")
    205 
    206    parser.add_argument("--no-minimize-boolean-attributes",
    207                        action="store_false",
    208                        dest="minimize_boolean_attributes",
    209                        help="minimize boolean attributes")
    210 
    211    parser.add_argument("--use-trailing-solidus", action="store_true",
    212                        help="use trailing solidus")
    213 
    214    parser.add_argument("--space-before-trailing-solidus",
    215                        action="store_true",
    216                        help="add space before trailing solidus")
    217 
    218    parser.add_argument("--escape-lt-in-attrs", action="store_true",
    219                        help="escape less than signs in attribute values")
    220 
    221    parser.add_argument("--escape-rcdata", action="store_true",
    222                        help="escape rcdata element values")
    223 
    224    parser.add_argument("--sanitize", action="store_true",
    225                        help="sanitize")
    226 
    227    parser.add_argument("-l", "--log", action="store_true",
    228                        help="log state transitions")
    229 
    230    parser.add_argument("filename")
    231 
    232    return parser
    233 
    234 
    235 if __name__ == "__main__":
    236    parse()