tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

entities.py (2707B)


      1 import json
      2 
      3 import html5lib
      4 
      5 
      6 def parse(path="html5ents.xml"):
      7    return html5lib.parse(open(path), treebuilder="lxml")
      8 
      9 
     10 def entity_table(tree):
     11    return {entity_name("".join(tr[0].xpath(".//text()"))):
     12            entity_characters(tr[1].text)
     13            for tr in tree.xpath("//h:tbody/h:tr",
     14                                 namespaces={"h": "http://www.w3.org/1999/xhtml"})}
     15 
     16 
     17 def entity_name(inp):
     18    return inp.strip()
     19 
     20 
     21 def entity_characters(inp):
     22    return "".join(codepoint_to_character(item)
     23                   for item in inp.split()
     24                   if item)
     25 
     26 
     27 def codepoint_to_character(inp):
     28    return ("\\U000" + inp[2:]).decode("unicode-escape")
     29 
     30 
     31 def make_tests_json(entities):
     32    test_list = make_test_list(entities)
     33    tests_json = {"tests":
     34                  [make_test(*item) for item in test_list]
     35                  }
     36    return tests_json
     37 
     38 
     39 def make_test(name, characters, good):
     40    return {
     41        "description": test_description(name, good),
     42        "input": "&%s" % name,
     43        "output": test_expected(name, characters, good)
     44    }
     45 
     46 
     47 def test_description(name, good):
     48    with_semicolon = name.endswith(";")
     49    semicolon_text = {True: "with a semi-colon",
     50                      False: "without a semi-colon"}[with_semicolon]
     51    if good:
     52        text = "Named entity: %s %s" % (name, semicolon_text)
     53    else:
     54        text = "Bad named entity: %s %s" % (name, semicolon_text)
     55    return text
     56 
     57 
     58 def test_expected(name, characters, good):
     59    rv = []
     60    if not good or not name.endswith(";"):
     61        rv.append("ParseError")
     62    rv.append(["Character", characters])
     63    return rv
     64 
     65 
     66 def make_test_list(entities):
     67    tests = []
     68    for entity_name, characters in entities.items():
     69        if entity_name.endswith(";") and not subentity_exists(entity_name, entities):
     70            tests.append((entity_name[:-1], "&" + entity_name[:-1], False))
     71        tests.append((entity_name, characters, True))
     72    return sorted(tests)
     73 
     74 
     75 def subentity_exists(entity_name, entities):
     76    for i in range(1, len(entity_name)):
     77        if entity_name[:-i] in entities:
     78            return True
     79    return False
     80 
     81 
     82 def make_entities_code(entities):
     83    entities_text = "\n".join("    \"%s\": u\"%s\"," % (
     84        name, entities[name].encode(
     85            "unicode-escape").replace("\"", "\\\""))
     86        for name in sorted(entities.keys()))
     87    return """entities = {
     88 %s
     89 }""" % entities_text
     90 
     91 
     92 def main():
     93    entities = entity_table(parse())
     94    tests_json = make_tests_json(entities)
     95    json.dump(tests_json, open("namedEntities.test", "w"), indent=4)
     96    code = make_entities_code(entities)
     97    open("entities_constants.py", "w").write(code)
     98 
     99 
    100 if __name__ == "__main__":
    101    main()