tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

prepare_tlds.py (4417B)


      1 # This Source Code Form is subject to the terms of the Mozilla Public
      2 # License, v. 2.0. If a copy of the MPL was not distributed with this
      3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      4 
      5 import codecs
      6 import encodings.idna
      7 import re
      8 import sys
      9 
     10 from make_dafsa import words_to_bin, words_to_cxx
     11 
     12 """
     13 Processes a file containing effective TLD data.  See the following URL for a
     14 description of effective TLDs and of the file format that this script
     15 processes (although for the latter you're better off just reading this file's
     16 short source code).
     17 
     18 http://wiki.mozilla.org/Gecko:Effective_TLD_Service
     19 """
     20 
     21 
     22 def getEffectiveTLDs(path):
     23    with codecs.open(path, "r", "UTF-8") as file:
     24        domains = set()
     25        for line in file:
     26            # line always contains a line terminator unless the file is empty
     27            if len(line) == 0:
     28                raise StopIteration
     29            line = line.rstrip()
     30            # comment, empty, or superfluous line for explicitness purposes
     31            if line.startswith("//") or not line.strip():
     32                continue
     33            line = re.split(r"[ \t\n]", line, 1)[0]
     34            entry = EffectiveTLDEntry(line)
     35            domain = entry.domain()
     36            assert domain not in domains, "repeating domain %s makes no sense" % domain
     37            domains.add(domain)
     38            yield entry
     39 
     40 
     41 def _normalizeHostname(domain):
     42    """
     43    Normalizes the given domain, component by component.  ASCII components are
     44    lowercased, while non-ASCII components are processed using the ToASCII
     45    algorithm.
     46    """
     47 
     48    def convertLabel(label):
     49        if _isASCII(label):
     50            return label.lower()
     51        return encodings.idna.ToASCII(label).decode("utf-8")
     52 
     53    return ".".join(map(convertLabel, domain.split(".")))
     54 
     55 
     56 def _isASCII(s):
     57    "True if s consists entirely of ASCII characters, false otherwise."
     58    for c in s:
     59        if ord(c) > 127:
     60            return False
     61    return True
     62 
     63 
     64 class EffectiveTLDEntry:
     65    """
     66    Stores an entry in an effective-TLD name file.
     67    """
     68 
     69    _exception = False
     70    _wild = False
     71 
     72    def __init__(self, line):
     73        """
     74        Creates a TLD entry from a line of data, which must have been stripped of
     75        the line ending.
     76        """
     77        if line.startswith("!"):
     78            self._exception = True
     79            domain = line[1:]
     80        elif line.startswith("*."):
     81            self._wild = True
     82            domain = line[2:]
     83        else:
     84            domain = line
     85        self._domain = _normalizeHostname(domain)
     86 
     87    def domain(self):
     88        "The domain this represents."
     89        return self._domain
     90 
     91    def exception(self):
     92        "True if this entry's domain denotes does not denote an effective TLD."
     93        return self._exception
     94 
     95    def wild(self):
     96        "True if this entry represents a class of effective TLDs."
     97        return self._wild
     98 
     99 
    100 #################
    101 # DO EVERYTHING #
    102 #################
    103 
    104 
    105 def main(output, effective_tld_filename, output_format="cxx"):
    106    """
    107    effective_tld_filename is the effective TLD file to parse.
    108    based on the output format, either a C++ array of a binary representation
    109    of a DAFSA representing the eTLD file is then printed to standard output
    110    or a binary file is written to disk.
    111    """
    112 
    113    def typeEnum(etld):
    114        """
    115        Maps the flags to the DAFSA's enum types.
    116        """
    117        if etld.exception():
    118            return 1
    119        elif etld.wild():
    120            return 2
    121        else:
    122            return 0
    123 
    124    def dafsa_words():
    125        """
    126        make_dafsa expects lines of the form "<domain_name><enum_value>"
    127        """
    128        for etld in getEffectiveTLDs(effective_tld_filename):
    129            yield "%s%d" % (etld.domain(), typeEnum(etld))
    130 
    131    """ words_to_bin() returns a bytes while words_to_cxx() returns string """
    132    if output_format == "bin":
    133        output.write(words_to_bin(dafsa_words()))
    134    else:
    135        output.write(words_to_cxx(dafsa_words()))
    136 
    137 
    138 if __name__ == "__main__":
    139    """
    140    This program can output the DAFSA in two formats:
    141    as C++ code that will be included and compiled at build time
    142    or as a binary file that will be published in Remote Settings.
    143 
    144    Flags for format options:
    145    "cxx" -> C++ array [default]
    146    "bin" -> Binary file
    147    """
    148 
    149    output_format = "bin" if "--bin" in sys.argv else "cxx"
    150    main(sys.stdout, sys.argv[1], output_format=output_format)