update_html5lib_tests.py (6997B)
"""Generate HTML test files from the html5lib tree-construction test data.

The script installs a pinned checkout of html5lib-python into a temporary
directory, reads its ``.dat`` test data and renders each data file through the
Genshi templates that sit next to this script, writing the results under
``TESTS_PATH`` in the enclosing git repository.
"""
import glob
import hashlib
import itertools
import json
import os
import re
import shutil
import site
import subprocess
import sys
import tempfile
import urllib
import urllib.parse
from importlib import reload


import genshi
from genshi.template import MarkupTemplate


TESTS_PATH = "html/syntax/parsing/"


def get_paths():
    """Return ``(script_path, tests_path)``.

    ``script_path`` is the directory containing this file and ``tests_path``
    is ``TESTS_PATH`` joined onto the root of the enclosing git repository.

    Raises:
        RuntimeError: if no enclosing git repository can be found.
    """
    script_path = os.path.dirname(os.path.abspath(__file__))
    repo_base = get_repo_base(script_path)
    if repo_base is None:
        raise RuntimeError(
            "Could not find a git repository containing %s" % script_path)
    tests_path = os.path.join(repo_base, TESTS_PATH)
    return script_path, tests_path


def get_repo_base(path):
    """Return the nearest ancestor of *path* (or *path* itself) holding ``.git``.

    Returns ``None`` when no such directory exists.
    """
    while path:
        if os.path.exists(os.path.join(path, ".git")):
            return path
        parent = os.path.dirname(path)
        if parent == path:
            # dirname() of a filesystem root is the root itself; without this
            # check the loop would never terminate outside a repository.
            break
        path = parent
    return None


def get_expected(data):
    """Return *data* prefixed with a ``#document`` line."""
    return "#document\n" + data


def get_hash(data, container=None):
    """Return the SHA-1 hex digest identifying a test.

    The digest covers both the fragment *container* (an empty string is used
    when it is ``None``) and the test input *data*, each encoded as UTF-8.
    """
    if container is None:
        container = ""
    payload = b"#container%s#data%s" % (container.encode("utf8"),
                                        data.encode("utf8"))
    return hashlib.sha1(payload).hexdigest()


class Html5libInstall:
    """Context manager that installs html5lib-python from a temporary clone.

    On entry the repository is cloned into a temporary directory, the
    requested revisions are checked out and the package is installed in
    editable mode. On exit the package is uninstalled and the temporary
    directory is removed.
    """

    def __init__(self, rev=None, tests_rev=None):
        """Store the revisions to check out.

        Args:
            rev: revision of html5lib-python; ``origin/master`` when ``None``.
            tests_rev: revision checked out in the test-data directory;
                ``origin/master`` when ``None``.
        """
        self.html5lib_dir = None
        self.rev = rev
        self.tests_rev = tests_rev

    def __enter__(self):
        self.html5lib_dir = tempfile.TemporaryDirectory()
        html5lib_path = self.html5lib_dir.__enter__()
        html5lib_python_path = os.path.join(html5lib_path, "html5lib")
        html5lib_tests_path = os.path.join(
            html5lib_python_path, "html5lib", "tests", "testdata"
        )

        subprocess.check_call(
            [
                "git",
                "clone",
                "--no-checkout",
                "https://github.com/html5lib/html5lib-python.git",
                "html5lib",
            ],
            cwd=html5lib_path,
        )

        rev = self.rev if self.rev is not None else "origin/master"
        subprocess.check_call(
            ["git", "checkout", rev], cwd=html5lib_python_path
        )

        subprocess.check_call(
            [
                "git",
                "submodule",
                "update",
                "--init",
                "--recursive",
            ],
            cwd=html5lib_python_path,
        )

        # Run pip through the current interpreter so the package lands in the
        # environment that will import it, not whichever ``pip`` is on PATH.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-e", "html5lib"],
            cwd=html5lib_path,
        )
        # Re-run site so the freshly installed editable package is importable
        # in this already-running process.
        reload(site)

        # Checked out after the submodule update, so it overrides whatever
        # revision that update left in the test-data directory.
        tests_rev = self.tests_rev if self.tests_rev is not None else "origin/master"
        subprocess.check_call(["git", "checkout", tests_rev], cwd=html5lib_tests_path)
        return self

    def __exit__(self, *args, **kwargs):
        try:
            # Best effort: the exit status of the uninstall is ignored.
            subprocess.call(
                [sys.executable, "-m", "pip", "uninstall", "-y", "html5lib"],
                cwd=self.html5lib_dir.name,
            )
        finally:
            # Always remove the temporary checkout, even if the call above
            # raised.
            self.html5lib_dir.__exit__(*args, **kwargs)
            self.html5lib_dir = None


def make_tests(script_dir, out_dir, input_file_name, test_data):
    """Write the test file(s) generated from one html5lib data file.

    Tests containing a ``script-off`` key are skipped. Tests with a
    ``document-fragment`` key go to the innerHTML file, the others to the
    normal file. A test whose hash was already seen in this data file is
    skipped with a warning.

    Args:
        script_dir: directory containing the templates.
        out_dir: directory the generated files are written to.
        input_file_name: base name used in the output file names.
        test_data: iterable of mappings with at least ``data`` and
            ``document`` keys.

    Returns:
        ``(path_normal, path_innerHTML)``; either is ``None`` when no test of
        that kind was found.
    """
    tests = []
    innerHTML_tests = []
    ids_seen = {}
    print(input_file_name)
    for test in test_data:
        if "script-off" in test:
            continue
        is_innerHTML = "document-fragment" in test
        data = test["data"]
        container = test["document-fragment"] if is_innerHTML else None
        assert test["document"], test
        expected = get_expected(test["document"])
        test_list = innerHTML_tests if is_innerHTML else tests
        test_id = get_hash(data, container)
        if test_id in ids_seen:
            print("WARNING: id %s seen multiple times in file %s this time for test (%s, %s) before for test %s, skipping"%(test_id, input_file_name, container, data, ids_seen[test_id]))
            continue
        ids_seen[test_id] = (container, data)
        test_list.append({
            'string_uri_encoded_input': "\"%s\"" % urllib.parse.quote(data.encode("utf8")),
            'input': data,
            'expected': expected,
            'string_escaped_expected': json.dumps(urllib.parse.quote(expected.encode("utf8"))),
            'id': test_id,
            'container': container,
        })

    path_normal = None
    if tests:
        path_normal = write_test_file(script_dir, out_dir,
                                      tests, "html5lib_%s" % input_file_name,
                                      "html5lib_test.xml")
    path_innerHTML = None
    if innerHTML_tests:
        path_innerHTML = write_test_file(script_dir, out_dir,
                                         innerHTML_tests, "html5lib_innerHTML_%s" % input_file_name,
                                         "html5lib_test_fragment.xml")

    return path_normal, path_innerHTML


def write_test_file(script_dir, out_dir, tests, file_name, template_file_name):
    """Render *tests* through a template and write the result as HTML.

    Args:
        script_dir: directory containing *template_file_name*.
        out_dir: directory the output file is written to.
        tests: list of test dictionaries passed to the template.
        file_name: output file name without the ``.html`` extension.
        template_file_name: name of the Genshi markup template to use.

    Returns:
        The path of the file that was written.
    """
    file_name = os.path.join(out_dir, file_name + ".html")
    short_name = os.path.basename(file_name)

    with open(os.path.join(script_dir, template_file_name), "r") as f:
        template = MarkupTemplate(f)

    stream = template.generate(file_name=short_name, tests=tests)

    # The rendered bytes are UTF-8, so write the decoded text back as UTF-8
    # rather than in the locale's default encoding.
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(str(stream.render('html', doctype='html5',
                                  encoding="utf8"), "utf-8"))
    return file_name


def escape_js_string(in_data):
    """Return *in_data* escaped for use inside a double-quoted JS string.

    Backslashes, double quotes and control characters are backslash-escaped
    and non-ASCII characters become ``\\uXXXX`` escapes. The surrounding
    quotes are not included.
    """
    # json.dumps emits a quoted, ASCII-only string literal; strip its quotes.
    return json.dumps(in_data)[1:-1]


def serialize_filenames(test_filenames):
    """Return *test_filenames* formatted as a bracketed list of quoted names."""
    return "[" + ",\n".join("\"%s\"" % item for item in test_filenames) + "]"


def main():
    """Regenerate the test files from the pinned html5lib revisions.

    The revisions are read from the ``html5lib_python_revision`` and
    ``html5lib_tests_revision`` files next to this script. When ``sys.argv``
    has more than two entries, the ``*.dat`` files in the directory named by
    ``sys.argv[2]`` are used instead of html5lib's bundled tree-construction
    data; ``sys.argv[1]`` is not read here.
    """
    script_dir, out_dir = get_paths()

    # NOTE(review): these lists are filled below but never read afterwards in
    # this file.
    test_files = []
    inner_html_files = []
    with open(os.path.join(script_dir, "html5lib_python_revision"), "r") as f:
        html5lib_rev = f.read().strip()

    with open(os.path.join(script_dir, "html5lib_tests_revision"), "r") as f:
        html5lib_tests_rev = f.read().strip()

    with Html5libInstall(html5lib_rev, html5lib_tests_rev):
        # Imported here because html5lib is only installed inside this block.
        from html5lib.tests import support

        if len(sys.argv) > 2:
            test_iterator = zip(
                itertools.repeat(False),
                sorted(os.path.abspath(item) for item in
                       glob.glob(os.path.join(sys.argv[2], "*.dat"))))
        else:
            test_iterator = itertools.chain(
                zip(itertools.repeat(False),
                    sorted(support.get_data_files("tree-construction"))),
                zip(itertools.repeat(True),
                    sorted(support.get_data_files(
                        os.path.join("tree-construction", "scripted")))))

        # The loop stays inside the ``with`` block: the data files live in the
        # temporary checkout, which is deleted on exit.
        for (scripted, test_file) in test_iterator:
            input_file_name = os.path.splitext(os.path.basename(test_file))[0]
            if scripted:
                input_file_name = "scripted_" + input_file_name
            test_data = support.TestData(test_file)
            test_filename, inner_html_file_name = make_tests(script_dir, out_dir,
                                                             input_file_name, test_data)
            if test_filename is not None:
                test_files.append(test_filename)
            if inner_html_file_name is not None:
                inner_html_files.append(inner_html_file_name)


if __name__ == "__main__":
    main()