[ tor-browser ].git.dasho

IdnaTestV2-parser.py (6930B)
      1 # This script can convert IdnaTestV2.txt to JSON, accounting for the requirements in the
      2 # URL Standard.
      3 #
      4 # The goal is to eventually remove --exclude-std3 and --exclude-bidi. For that we need solutions to
      5 # these issues:
      6 #
      7 # * https://github.com/whatwg/url/issues/341
      8 # * https://github.com/whatwg/url/issues/543
      9 # * https://github.com/whatwg/url/issues/733
     10 # * https://github.com/whatwg/url/issues/744
     11 #
     12 # Removal of --exclude-ipv4-like is a stretch goal also dependent upon those issues.
     13 
     14 import argparse
     15 import json
     16 import os
     17 import re
     18 import urllib.request
     19 
     20 def get_IdnaTestV2_lines():
     21    IdnaTestV2 = os.path.join(os.path.dirname(__file__), "IdnaTestV2.txt")
     22    if not os.path.exists(IdnaTestV2):
     23        # Download IdnaTestV2.txt if it doesn't exist yet
     24        url = "https://unicode.org/Public/idna/latest/IdnaTestV2.txt"
     25        content = urllib.request.urlopen(url).read()
     26        open(IdnaTestV2, "wb").write(content)
     27    return open(IdnaTestV2, "r", encoding="utf-8").readlines()
     28 
     29 def remove_escapes(input):
     30    return json.loads("\"" + input + "\"")
     31 
     32 def get_column_value(input, default = ""):
     33    if input == "":
     34        return default
     35    # "" means an empty string
     36    if input == "\"\"":
     37        return ""
     38    # Remove escapes (doesn't handle \x{XXXX} but those do not appear in the source)
     39    return remove_escapes(input)
     40 
     41 def ends_in_a_number(input):
     42    # This method is not robust. It uses https://www.unicode.org/reports/tr46/#Notation but there
     43    # are likely other ways to end up with a dot, e.g., through decomposition or percent-decoding.
     44    # It also does not entirely match https://url.spec.whatwg.org/#ends-in-a-number-checker. It
     45    # appears to suffice for the tests in question though.
     46    parts = re.split(r"\u002E|\uFF0E|\u3002|\uFF61", input)
     47    if not parts:
     48        return False
     49    if parts[-1] == "":
     50        if len(parts) == 1:
     51            return False
     52        parts.pop()
     53    return parts[-1].isascii() and parts[-1].isdigit()
     54 
     55 def contains_bidi_status(statuses):
     56    for status in statuses:
     57        if status in ["B1", "B2", "B3", "B4", "B5", "B6"]:
     58            return True
     59    return False
     60 
     61 def parse(lines, exclude_ipv4_like, exclude_std3, exclude_bidi):
     62    # Main quest.
     63    output = ["THIS IS A GENERATED FILE. PLEASE DO NOT MODIFY DIRECTLY. See ../tools/IdnaTestV2-parser.py instead."]
     64    output.append(f"--exclude-ipv4-like: {exclude_ipv4_like}; --exclude-std3: {exclude_std3}; --exclude-bidi: {exclude_bidi}")
     65 
     66    # Side quest.
     67    unique_statuses = []
     68 
     69    for line in lines:
     70        # Remove newlines
     71        line = line.rstrip()
     72 
     73        # Remove lines that are comments or empty
     74        if line.startswith("#") or line == "":
     75            continue
     76 
     77        # Normalize columns
     78        #
     79        # Since we are only interested in ToASCII and enforce Transitional_Processing=false we care
     80        # about the following columns:
     81        #
     82        # * Column 1 (source)
     83        # * Column 4 (toAsciiN)
     84        # * Column 5 (toAsciiNStatus)
     85        #
     86        # We also store Column 2 (toUnicode) to help with UseSTD3ASCIIRules exclusion.
     87        columns = [column.strip() for column in line.split(";")]
     88 
     89        # Column 1 (source) and Column 2 (toUnicode; if empty, Column 1 (source))
     90        source = get_column_value(columns[0])
     91        to_unicode = get_column_value(columns[1], source)
     92 
     93        # Immediately exclude IPv4-like tests when desired. While we could force all their
     94        # expectations to be failure instead, it's not clear we need that many additional tests that
     95        # were actually trying to test something else.
     96        if exclude_ipv4_like:
     97            if ends_in_a_number(source):
     98                continue
     99 
    100        if exclude_std3:
    101            if re.search(r"\<|\>|\:|\/|\?|\#|\\", to_unicode):
    102                continue
    103 
    104        # Column 4 (toAsciiN; if empty, use Column 2 (toUnicode))
    105        to_ascii = get_column_value(columns[3], to_unicode)
    106 
    107        # Column 5 (toAsciiNStatus; if empty, use Column 3 (toUnicodeStatus))
    108        temp_statuses = columns[4]
    109        if temp_statuses == "":
    110            temp_statuses = columns[2]
    111 
    112        statuses = []
    113        if temp_statuses != "":
    114            assert temp_statuses.startswith("[")
    115            statuses = [status.strip() for status in temp_statuses[1:-1].split(",")]
    116 
    117        # Side quest time.
    118        for status in statuses:
    119            if status not in unique_statuses:
    120                unique_statuses.append(status)
    121 
    122        # The URL Standard has
    123        #
    124        # * UseSTD3ASCIIRules=false; however there are no tests marked U1 (some should be though)
    125        # * CheckHyphens=false; thus ignore V2, V3?
    126        # * VerifyDnsLength=false; thus ignore A4_1 and A4_2
    127        ignored_statuses = []
    128        for status in statuses:
    129            if status in ["A4_1", "A4_2", "U1", "V2", "V3"]:
    130                ignored_statuses.append(status)
    131        for status in ignored_statuses:
    132            statuses.remove(status)
    133 
    134        if exclude_bidi and contains_bidi_status(statuses):
    135            continue
    136 
    137        if len(statuses) > 0:
    138            to_ascii = None
    139 
    140        test = { "input": source, "output": to_ascii }
    141        comment = ""
    142        for status in statuses:
    143            comment += status + "; "
    144        for status in ignored_statuses:
    145            comment += status + " (ignored); "
    146        if comment != "":
    147            test["comment"] = comment.strip()[:-1]
    148        output.append(test)
    149 
    150    unique_statuses.sort()
    151    return { "tests": output, "unique_statuses": unique_statuses }
    152 
    153 def to_json(data):
    154    handle = open(os.path.join(os.path.dirname(__file__), "../resources/IdnaTestV2.json"), "w", encoding="utf-8")
    155    handle.write(json.dumps(data, sort_keys=True, allow_nan=False, indent=2, separators=(',', ': ')))
    156    handle.write("\n")
    157    handle.close()
    158 
    159 def main():
    160    parser = argparse.ArgumentParser(epilog="Thanks for caring about IDNA!")
    161    parser.add_argument("--generate", action="store_true", help="Generate the JSON resource.")
    162    parser.add_argument("--exclude-ipv4-like", action="store_true", help="Exclude inputs that end with an ASCII digit label. (Not robust, but works for current input.)")
    163    parser.add_argument("--exclude-std3", action="store_true", help="Exclude tests impacted by UseSTD3ASCIIRules. (Not robust, but works for current input.)")
    164    parser.add_argument("--exclude-bidi", action="store_true", help="Exclude tests impacted by CheckBidi.")
    165    parser.add_argument("--statuses", action="store_true", help="Print the unique statuses in IdnaTestV2.txt.")
    166    args = parser.parse_args()
    167 
    168    if args.generate or args.statuses:
    169        output = parse(get_IdnaTestV2_lines(), args.exclude_ipv4_like, args.exclude_std3, args.exclude_bidi)
    170        if args.statuses:
    171            print(output["unique_statuses"])
    172        else:
    173            assert args.generate
    174            to_json(output["tests"])
    175    else:
    176        parser.print_usage()
    177 
    178 main()
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE