entities.py (2707B)
import json

try:
    import html5lib
except ImportError:
    # Only parse() needs the parser; the pure helpers below stay importable
    # (and unit-testable) when html5lib is not installed.
    html5lib = None


def parse(path="html5ents.xml"):
    """Parse the entity definition document at *path* with html5lib.

    Returns the result of ``html5lib.parse`` using the ``lxml`` treebuilder.
    Raises ``ImportError`` if html5lib is not installed.
    """
    if html5lib is None:
        raise ImportError("html5lib is required to parse %s" % path)
    # The context manager closes the file once parsing has finished; the
    # original left the handle open until garbage collection.
    with open(path) as source:
        return html5lib.parse(source, treebuilder="lxml")


def entity_table(tree):
    """Build a dict mapping entity names to their replacement characters.

    Every ``tbody/tr`` element in the XHTML namespace contributes one entry:
    the text content of its first child is the entity name and the ``text``
    of its second child is a whitespace-separated list of codepoint tokens.
    """
    rows = tree.xpath("//h:tbody/h:tr",
                      namespaces={"h": "http://www.w3.org/1999/xhtml"})
    return {entity_name("".join(row[0].xpath(".//text()"))):
            entity_characters(row[1].text)
            for row in rows}


def entity_name(inp):
    """Return the entity name *inp* with surrounding whitespace removed."""
    return inp.strip()


def entity_characters(inp):
    """Convert whitespace-separated codepoint tokens into a string.

    Each token is converted with ``codepoint_to_character``; an empty or
    all-whitespace *inp* yields the empty string.
    """
    # str.split() with no argument never yields empty items, so no extra
    # filtering is needed.
    return "".join(codepoint_to_character(item) for item in inp.split())


def codepoint_to_character(inp):
    """Return the character named by a codepoint token such as ``U+000C6``.

    The first two characters of *inp* are discarded and the remainder is
    read as hexadecimal digits (at most eight).
    """
    # Left-pad to the eight digits a "\U" escape requires, so tokens with
    # fewer than five digits work as well as the five-digit form.
    escape = "\\U" + inp[2:].zfill(8)
    # Go through ASCII bytes before decoding: text strings have no
    # ``decode`` method on Python 3, and on Python 2 the extra encode is
    # harmless.
    return escape.encode("ascii").decode("unicode-escape")


def make_tests_json(entities):
    """Return the JSON-serialisable test suite for *entities*.

    The result is ``{"tests": [...]}`` with one test dict per entry returned
    by ``make_test_list``.
    """
    return {"tests": [make_test(*item) for item in make_test_list(entities)]}


def make_test(name, characters, good):
    """Return a single tokenizer test dict for the entity *name*.

    *characters* is the expected character data and *good* says whether
    *name* is a real entity.
    """
    return {
        "description": test_description(name, good),
        "input": "&%s" % name,
        "output": test_expected(name, characters, good)
    }


def test_description(name, good):
    """Return the human-readable description for the test of *name*."""
    if name.endswith(";"):
        semicolon_text = "with a semi-colon"
    else:
        semicolon_text = "without a semi-colon"
    prefix = "Named entity" if good else "Bad named entity"
    return "%s: %s %s" % (prefix, name, semicolon_text)


def test_expected(name, characters, good):
    """Return the expected output list for the test of *name*.

    A ``"ParseError"`` marker precedes the character token when the entity
    is bad or is written without a trailing semicolon.
    """
    rv = []
    if not good or not name.endswith(";"):
        rv.append("ParseError")
    rv.append(["Character", characters])
    return rv


# These two helpers only build test data. Their names match the pattern test
# runners collect, so mark them explicitly as not being test cases.
test_description.__test__ = False
test_expected.__test__ = False


def make_test_list(entities):
    """Return a sorted list of ``(name, characters, good)`` test tuples.

    Every entity produces a "good" test. An entity ending in ``;`` with no
    shorter prefix that is itself an entity also produces a "bad" test for
    the name without the semicolon, whose expected output is the literal
    input text.
    """
    tests = []
    # The loop variable is deliberately not called ``entity_name`` so the
    # module-level function of that name is not shadowed.
    for name, characters in entities.items():
        if name.endswith(";") and not subentity_exists(name, entities):
            bare = name[:-1]
            tests.append((bare, "&" + bare, False))
        tests.append((name, characters, True))
    return sorted(tests)


def subentity_exists(entity_name, entities):
    """Return True if a proper, non-empty prefix of *entity_name* is an entity."""
    return any(entity_name[:end] in entities
               for end in range(1, len(entity_name)))
def make_entities_code(entities):
    """Return Python source text that defines an ``entities`` dict.

    Each entry of *entities* becomes one ``"name": u"value",`` row, sorted
    by name. Values are written with ``unicode-escape`` so the generated
    source is pure ASCII, and double quotes in them are backslash-escaped.
    """
    rows = []
    for name in sorted(entities):
        # encode() returns bytes; decode back to text before replace() and
        # formatting, otherwise Python 3 raises TypeError on the replace.
        escaped = entities[name].encode("unicode-escape").decode("ascii")
        escaped = escaped.replace("\"", "\\\"")
        rows.append(" \"%s\": u\"%s\"," % (name, escaped))
    entities_text = "\n".join(rows)
    return """entities = {
%s
}""" % entities_text


def main():
    """Regenerate ``namedEntities.test`` and ``entities_constants.py``.

    Parses the default entity document, then writes the JSON test suite and
    the generated constants module into the current directory.
    """
    entities = entity_table(parse())
    tests_json = make_tests_json(entities)
    # Context managers guarantee both outputs are flushed and closed.
    with open("namedEntities.test", "w") as test_file:
        json.dump(tests_json, test_file, indent=4)
    code = make_entities_code(entities)
    with open("entities_constants.py", "w") as code_file:
        code_file.write(code)


if __name__ == "__main__":
    main()