IdnaTestV2-parser.py (6930B)
# This script can convert IdnaTestV2.txt to JSON, accounting for the requirements in the
# URL Standard.
#
# The goal is to eventually remove --exclude-std3 and --exclude-bidi. For that we need solutions to
# these issues:
#
# * https://github.com/whatwg/url/issues/341
# * https://github.com/whatwg/url/issues/543
# * https://github.com/whatwg/url/issues/733
# * https://github.com/whatwg/url/issues/744
#
# Removal of --exclude-ipv4-like is a stretch goal also dependent upon those issues.

import argparse
import json
import os
import re
import urllib.request

# Label separators listed in https://www.unicode.org/reports/tr46/#Notation.
_LABEL_SEPARATORS = re.compile(r"\u002E|\uFF0E|\u3002|\uFF61")

# Code points used to detect tests impacted by UseSTD3ASCIIRules.
_STD3_IMPACTED = re.compile(r"\<|\>|\:|\/|\?|\#|\\")

# Statuses that mark a test as impacted by CheckBidi.
_BIDI_STATUSES = frozenset(["B1", "B2", "B3", "B4", "B5", "B6"])

# The URL Standard has
#
# * UseSTD3ASCIIRules=false; however there are no tests marked U1 (some should be though)
# * CheckHyphens=false; thus ignore V2, V3?
# * VerifyDnsLength=false; thus ignore A4_1 and A4_2
_IGNORED_STATUSES = frozenset(["A4_1", "A4_2", "U1", "V2", "V3"])


def get_IdnaTestV2_lines():
    """Return the lines of IdnaTestV2.txt, downloading the file first if needed.

    The file is looked up next to this script. When it does not exist it is
    fetched from unicode.org and stored there before being read.

    Returns:
        A list of lines (including their line terminators), decoded as UTF-8.
    """
    IdnaTestV2 = os.path.join(os.path.dirname(__file__), "IdnaTestV2.txt")
    if not os.path.exists(IdnaTestV2):
        # Download IdnaTestV2.txt if it doesn't exist yet
        url = "https://unicode.org/Public/idna/latest/IdnaTestV2.txt"
        with urllib.request.urlopen(url) as response:
            content = response.read()
        with open(IdnaTestV2, "wb") as handle:
            handle.write(content)
    with open(IdnaTestV2, "r", encoding="utf-8") as handle:
        return handle.readlines()


def remove_escapes(input):
    """Decode the escapes in *input* by parsing it as the body of a JSON string."""
    return json.loads("\"" + input + "\"")


def get_column_value(input, default = ""):
    """Return the value of a column, with escapes removed.

    Args:
        input: The stripped column text.
        default: The value to use when the column is blank.

    Returns:
        *default* for a blank column, the empty string for a column that is
        exactly `""`, and otherwise the column text with escapes decoded.
    """
    if input == "":
        return default
    # "" means an empty string
    if input == "\"\"":
        return ""
    # Remove escapes (doesn't handle \x{XXXX} but those do not appear in the source)
    return remove_escapes(input)


def ends_in_a_number(input):
    """Return True when the last non-empty-trailing label of *input* is ASCII digits.

    A single trailing separator is disregarded, so "a.1." is treated like "a.1".
    """
    # This method is not robust. It uses https://www.unicode.org/reports/tr46/#Notation but there
    # are likely other ways to end up with a dot, e.g., through decomposition or percent-decoding.
    # It also does not entirely match https://url.spec.whatwg.org/#ends-in-a-number-checker. It
    # appears to suffice for the tests in question though.
    parts = _LABEL_SEPARATORS.split(input)
    # re.split always yields at least one element, so parts is never empty here.
    if parts[-1] == "":
        if len(parts) == 1:
            return False
        parts.pop()
    last = parts[-1]
    return last.isascii() and last.isdigit()


def contains_bidi_status(statuses):
    """Return True when any of *statuses* is one of B1 through B6."""
    return any(status in _BIDI_STATUSES for status in statuses)


def _parse_status_list(raw):
    """Turn a status column such as "[B1, V2]" into a list of status codes.

    A blank column and an explicit "[]" both yield an empty list; the latter
    must not produce a bogus empty status code.

    Raises:
        ValueError: If a non-blank column does not start with "[".
    """
    if raw == "":
        return []
    if not raw.startswith("["):
        raise ValueError(f"Malformed status column: {raw!r}")
    stripped = (status.strip() for status in raw[1:-1].split(","))
    return [status for status in stripped if status]


def parse(lines, exclude_ipv4_like, exclude_std3, exclude_bidi):
    """Convert the lines of IdnaTestV2.txt into test records.

    Args:
        lines: The lines of IdnaTestV2.txt.
        exclude_ipv4_like: Skip tests whose source ends in a number.
        exclude_std3: Skip tests whose toUnicode value contains one of
            `<`, `>`, `:`, `/`, `?`, `#` or a backslash.
        exclude_bidi: Skip tests that still carry a B1-B6 status once the
            ignored statuses have been removed.

    Returns:
        A dict with two keys. "tests" is a list that starts with two header
        strings followed by one dict per test ("input", "output" and an
        optional "comment"); "output" is None when the test is expected to
        fail. "unique_statuses" is the sorted list of every status seen.

    Raises:
        ValueError: If a status column is not blank and does not start with "[".
    """
    # Main quest.
    output = ["THIS IS A GENERATED FILE. PLEASE DO NOT MODIFY DIRECTLY. See ../tools/IdnaTestV2-parser.py instead."]
    output.append(f"--exclude-ipv4-like: {exclude_ipv4_like}; --exclude-std3: {exclude_std3}; --exclude-bidi: {exclude_bidi}")

    # Side quest.
    unique_statuses = set()

    for line in lines:
        # Remove newlines
        line = line.rstrip()

        # Remove lines that are comments or empty
        if line.startswith("#") or line == "":
            continue

        # Normalize columns
        #
        # Since we are only interested in ToASCII and enforce Transitional_Processing=false we care
        # about the following columns:
        #
        # * Column 1 (source)
        # * Column 4 (toAsciiN)
        # * Column 5 (toAsciiNStatus)
        #
        # We also store Column 2 (toUnicode) to help with UseSTD3ASCIIRules exclusion.
        columns = [column.strip() for column in line.split(";")]

        # Column 1 (source) and Column 2 (toUnicode; if empty, Column 1 (source))
        source = get_column_value(columns[0])
        to_unicode = get_column_value(columns[1], source)

        # Immediately exclude IPv4-like tests when desired. While we could force all their
        # expectations to be failure instead, it's not clear we need that many additional tests that
        # were actually trying to test something else.
        if exclude_ipv4_like and ends_in_a_number(source):
            continue

        if exclude_std3 and _STD3_IMPACTED.search(to_unicode):
            continue

        # Column 4 (toAsciiN; if empty, use Column 2 (toUnicode))
        to_ascii = get_column_value(columns[3], to_unicode)

        # Column 5 (toAsciiNStatus; if empty, use Column 3 (toUnicodeStatus)). Only a blank
        # column falls back; an explicit [] means "no errors" and is kept as such.
        temp_statuses = columns[4]
        if temp_statuses == "":
            temp_statuses = columns[2]
        all_statuses = _parse_status_list(temp_statuses)

        # Side quest time. Recorded before any status gets ignored.
        unique_statuses.update(all_statuses)

        # Split off the statuses that do not apply to the URL Standard's configuration.
        ignored_statuses = [status for status in all_statuses if status in _IGNORED_STATUSES]
        statuses = [status for status in all_statuses if status not in _IGNORED_STATUSES]

        if exclude_bidi and contains_bidi_status(statuses):
            continue

        # Any remaining status means the conversion is expected to fail.
        if statuses:
            to_ascii = None

        test = { "input": source, "output": to_ascii }
        if statuses or ignored_statuses:
            annotated = statuses + [status + " (ignored)" for status in ignored_statuses]
            test["comment"] = "; ".join(annotated)
        output.append(test)

    return { "tests": output, "unique_statuses": sorted(unique_statuses) }


def to_json(data):
    """Write *data* as indented, key-sorted JSON to ../resources/IdnaTestV2.json.

    The path is resolved relative to the directory containing this script.
    """
    path = os.path.join(os.path.dirname(__file__), "../resources/IdnaTestV2.json")
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(data, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': ')))
        handle.write("\n")


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Optional list of argument strings. When None, argparse falls
            back to the process's command-line arguments.

    With --statuses the unique statuses are printed; otherwise, with
    --generate, the JSON resource is written. Without either flag only the
    usage line is printed.
    """
    parser = argparse.ArgumentParser(epilog="Thanks for caring about IDNA!")
    parser.add_argument("--generate", action="store_true", help="Generate the JSON resource.")
    parser.add_argument("--exclude-ipv4-like", action="store_true", help="Exclude inputs that end with an ASCII digit label. (Not robust, but works for current input.)")
    parser.add_argument("--exclude-std3", action="store_true", help="Exclude tests impacted by UseSTD3ASCIIRules. (Not robust, but works for current input.)")
    parser.add_argument("--exclude-bidi", action="store_true", help="Exclude tests impacted by CheckBidi.")
    parser.add_argument("--statuses", action="store_true", help="Print the unique statuses in IdnaTestV2.txt.")
    args = parser.parse_args(argv)

    # Nothing to do without one of the two action flags.
    if not (args.generate or args.statuses):
        parser.print_usage()
        return

    output = parse(get_IdnaTestV2_lines(), args.exclude_ipv4_like, args.exclude_std3, args.exclude_bidi)
    if args.statuses:
        # --statuses takes precedence over --generate when both are given.
        print(output["unique_statuses"])
    else:
        to_json(output["tests"])


# Only run when executed as a script, so importing this file has no side effects.
if __name__ == "__main__":
    main()