tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

filtration.py (16480B)


      1 # Copyright (C) 2018 and later: Unicode, Inc. and others.
      2 # License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 # Python 2/3 Compatibility (ICU-20299)
      5 # TODO(ICU-20301): Remove this.
      6 from __future__ import print_function
      7 
      8 from abc import abstractmethod
      9 from collections import defaultdict
     10 import re
     11 import sys
     12 
     13 from . import *
     14 from . import utils
     15 from .request_types import *
     16 
     17 
     18 # Note: for this to be a proper abstract class, it should extend abc.ABC.
     19 # There is no nice way to do this that works in both Python 2 and 3.
     20 # TODO(ICU-20301): Make this inherit from abc.ABC.
     21 class Filter(object):
     22    @staticmethod
     23    def create_from_json(json_data, io):
     24        assert io != None
     25        if "filterType" in json_data:
     26            filter_type = json_data["filterType"]
     27        else:
     28            filter_type = "file-stem"
     29 
     30        if filter_type == "file-stem":
     31            return FileStemFilter(json_data)
     32        elif filter_type == "language":
     33            return LanguageFilter(json_data)
     34        elif filter_type == "regex":
     35            return RegexFilter(json_data)
     36        elif filter_type == "exclude":
     37            return ExclusionFilter()
     38        elif filter_type == "union":
     39            return UnionFilter(json_data, io)
     40        elif filter_type == "intersection":
     41            return IntersectionFilter(json_data, io)
     42        elif filter_type == "complement":
     43            return ComplementFilter(json_data, io)
     44        elif filter_type == "locale":
     45            return LocaleFilter(json_data, io)
     46        else:
     47            print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr)
     48            return None
     49 
     50    def filter(self, request):
     51        if not request.apply_file_filter(self):
     52            return []
     53        for file in request.all_input_files():
     54            assert self.match(file)
     55        return [request]
     56 
     57    @staticmethod
     58    def _file_to_file_stem(file):
     59        start = file.filename.rfind("/")
     60        limit = file.filename.rfind(".")
     61        return file.filename[start+1:limit]
     62 
     63    @staticmethod
     64    def _file_to_subdir(file):
     65        limit = file.filename.rfind("/")
     66        if limit == -1:
     67            return None
     68        return file.filename[:limit]
     69 
     70    @abstractmethod
     71    def match(self, file):
     72        pass
     73 
     74 
     75 class InclusionFilter(Filter):
     76    def match(self, file):
     77        return True
     78 
     79 
     80 class ExclusionFilter(Filter):
     81    def match(self, file):
     82        return False
     83 
     84 
     85 class IncludeExcludeFilter(Filter):
     86    def __init__(self, json_data):
     87        if "whitelist" in json_data:
     88            self.is_includelist = True
     89            self.includelist = json_data["whitelist"]
     90        elif "includelist" in json_data:
     91            self.is_includelist = True
     92            self.includelist = json_data["includelist"]
     93        elif "blacklist" in json_data:
     94            self.is_includelist = False
     95            self.excludelist = json_data["blacklist"]
     96        elif "excludelist" in json_data:
     97            self.is_includelist = False
     98            self.excludelist = json_data["excludelist"]
     99        else:
    100            raise AssertionError("Need either includelist or excludelist: %s" % str(json_data))
    101 
    102    def match(self, file):
    103        file_stem = self._file_to_file_stem(file)
    104        return self._should_include(file_stem)
    105 
    106    @abstractmethod
    107    def _should_include(self, file_stem):
    108        pass
    109 
    110 
    111 class FileStemFilter(IncludeExcludeFilter):
    112    def _should_include(self, file_stem):
    113        if self.is_includelist:
    114            return file_stem in self.includelist
    115        else:
    116            return file_stem not in self.excludelist
    117 
    118 
    119 class LanguageFilter(IncludeExcludeFilter):
    120    def _should_include(self, file_stem):
    121        language = file_stem.split("_")[0]
    122        if language == "root":
    123            # Always include root.txt
    124            return True
    125        if self.is_includelist:
    126            return language in self.includelist
    127        else:
    128            return language not in self.excludelist
    129 
    130 
    131 class RegexFilter(IncludeExcludeFilter):
    132    def __init__(self, *args):
    133        # TODO(ICU-20301): Change this to: super().__init__(*args)
    134        super(RegexFilter, self).__init__(*args)
    135        if self.is_includelist:
    136            self.includelist = [re.compile(pat) for pat in self.includelist]
    137        else:
    138            self.excludelist = [re.compile(pat) for pat in self.excludelist]
    139 
    140    def _should_include(self, file_stem):
    141        if self.is_includelist:
    142            for pattern in self.includelist:
    143                if pattern.match(file_stem):
    144                    return True
    145            return False
    146        else:
    147            for pattern in self.excludelist:
    148                if pattern.match(file_stem):
    149                    return False
    150            return True
    151 
    152 
    153 class UnionFilter(Filter):
    154    def __init__(self, json_data, io):
    155        # Collect the sub-filters.
    156        self.sub_filters = []
    157        for filter_json in json_data["unionOf"]:
    158            self.sub_filters.append(Filter.create_from_json(filter_json, io))
    159 
    160    def match(self, file):
    161        """Match iff any of the sub-filters match."""
    162        for filter in self.sub_filters:
    163            if filter.match(file):
    164                return True
    165        return False
    166 
    167 
    168 class IntersectionFilter(Filter):
    169    def __init__(self, json_data, io):
    170        # Collect the sub-filters.
    171        self.sub_filters = []
    172        for filter_json in json_data["intersectionOf"]:
    173            self.sub_filters.append(Filter.create_from_json(filter_json, io))
    174 
    175    def match(self, file):
    176        """Match iff all of the sub-filters match."""
    177        for filter in self.sub_filters:
    178            if not filter.match(file):
    179                return False
    180        return True
    181 
    182 
    183 class ComplementFilter(Filter):
    184    def __init__(self, json_data, io):
    185        # There is only one sub-filter.
    186        filter_json = json_data["complementOf"]
    187        self.sub_filter = Filter.create_from_json(filter_json, io)
    188 
    189    def match(self, file):
    190        """Match iff the sub-filter does not match."""
    191        return not self.sub_filter.match(file)
    192 
    193 
# Matches a language plus a script subtag, e.g. "sr_Latn": a 2-3 letter
# lowercase language, underscore, then a 4-letter titlecase script code.
LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")
# Matches a bare 2-3 letter lowercase language code with no subtags.
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")
    196 
class LocaleFilter(Filter):
    """Filter that selects locale data files from a requested locale list.

    A file matches if its locale is *required* (requested, or an ancestor of
    a requested locale), or — depending on includeChildren/includeScripts —
    a descendant or script variant of a requested locale.
    """

    def __init__(self, json_data, io):
        # "whitelist" is the legacy spelling of "includelist".
        if "whitelist" in json_data:
            self.locales_requested = list(json_data["whitelist"])
        elif "includelist" in json_data:
            self.locales_requested = list(json_data["includelist"])
        else:
            raise AssertionError("You must have an includelist in a locale filter")
        # Whether descendants of requested locales also match (default True).
        self.include_children = json_data.get("includeChildren", True)
        # Whether alternative-script variants also match (default False).
        self.include_scripts = json_data.get("includeScripts", False)

        # Load the dependency graph from disk
        self.dependency_data_by_tree = {
            tree: io.read_locale_deps(tree)
            for tree in utils.ALL_TREES
        }

    def match(self, file):
        """Returns True if the file's locale is selected within its tree."""
        # The tree (e.g. "locales", "coll") is the file's subdirectory.
        tree = self._file_to_subdir(file)
        assert tree is not None
        locale = self._file_to_file_stem(file)

        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale.
        if locale in self._locales_required(tree):
            return True

        # Resolve include_scripts and include_children.
        return self._match_recursive(locale, tree)

    def _match_recursive(self, locale, tree):
        """Recursively checks locale and its ancestors against the request."""
        # Base case: return True if we reached a *requested* locale,
        # or False if we ascend out of the locale tree.
        if locale is None:
            return False
        if locale in self.locales_requested:
            return True

        # Check for alternative scripts.
        # This causes sr_Latn to check sr instead of going directly to root.
        if self.include_scripts:
            match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if match and self._match_recursive(match.group(1), tree):
                return True

        # Check if we are a descendant of a *requested* locale.
        if self.include_children:
            parent = self._get_parent_locale(locale, tree)
            if self._match_recursive(parent, tree):
                return True

        # No matches.
        return False

    def _get_parent_locale(self, locale, tree):
        """Gets the parent locale in the given tree, according to dependency data.

        Lookup order matters: explicit parents, then aliases, then the
        implicit truncation fallback ("root" for bare languages).
        """
        dependency_data = self.dependency_data_by_tree[tree]
        if "parents" in dependency_data and locale in dependency_data["parents"]:
            return dependency_data["parents"][locale]
        if "aliases" in dependency_data and locale in dependency_data["aliases"]:
            return dependency_data["aliases"][locale]
        if LANGUAGE_ONLY_REGEX.match(locale):
            return "root"
        i = locale.rfind("_")
        if i < 0:
            # Only "root" has no parent; anything else here is malformed.
            assert locale == "root", "Invalid locale: %s/%s" % (tree, locale)
            return None
        return locale[:i]

    def _locales_required(self, tree):
        """Returns a generator of all required locales in the given tree."""
        # Walk each requested locale up to (and including) "root".
        for locale in self.locales_requested:
            while locale is not None:
                yield locale
                locale = self._get_parent_locale(locale, tree)
    272 
    273 
    274 def apply_filters(requests, config, io):
    275    """Runs the filters and returns a new list of requests."""
    276    requests = _apply_file_filters(requests, config, io)
    277    requests = _apply_resource_filters(requests, config, io)
    278    return requests
    279 
    280 
    281 def _apply_file_filters(old_requests, config, io):
    282    """Filters out entire files."""
    283    filters = _preprocess_file_filters(old_requests, config, io)
    284    new_requests = []
    285    for request in old_requests:
    286        category = request.category
    287        if category in filters:
    288            new_requests += filters[category].filter(request)
    289        else:
    290            new_requests.append(request)
    291    return new_requests
    292 
    293 
    294 def _preprocess_file_filters(requests, config, io):
    295    all_categories = set(
    296        request.category
    297        for request in requests
    298    )
    299    all_categories.remove(None)
    300    all_categories = list(sorted(all_categories))
    301    json_data = config.filters_json_data
    302    filters = {}
    303    default_filter_json = "exclude" if config.strategy == "additive" else "include"
    304    for category in all_categories:
    305        filter_json = default_filter_json
    306        # Special default for category "brkitr_lstm" and "brkitr_adaboost" as "exclude" for now.
    307        if "brkitr_lstm" == category or "brkitr_adaboost" == category:
    308            filter_json = "exclude"
    309        # Figure out the correct filter to create for now.
    310        if "featureFilters" in json_data and category in json_data["featureFilters"]:
    311            filter_json = json_data["featureFilters"][category]
    312        if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"):
    313            filter_json = json_data["localeFilter"]
    314        # Resolve the filter JSON into a filter object
    315        if filter_json == "exclude":
    316            filters[category] = ExclusionFilter()
    317        elif filter_json == "include":
    318            pass  # no-op
    319        else:
    320            filters[category] = Filter.create_from_json(filter_json, io)
    321    if "featureFilters" in json_data:
    322        for category in json_data["featureFilters"]:
    323            if category not in all_categories:
    324                print("Warning: category %s is not known" % category, file=sys.stderr)
    325    return filters
    326 
    327 
    328 class ResourceFilterInfo(object):
    329    def __init__(self, category, strategy):
    330        self.category = category
    331        self.strategy = strategy
    332        self.filter_tmp_dir = "filters/%s" % category
    333        self.input_files = None
    334        self.filter_files = None
    335        self.rules_by_file = None
    336 
    337    def apply_to_requests(self, all_requests):
    338        # Call this method only once per list of requests.
    339        assert self.input_files is None
    340        for request in all_requests:
    341            if request.category != self.category:
    342                continue
    343            if not isinstance(request, AbstractExecutionRequest):
    344                continue
    345            if request.tool != IcuTool("genrb"):
    346                continue
    347            if not request.input_files:
    348                continue
    349            self._set_files(request.input_files)
    350            request.dep_targets += [self.filter_files[:]]
    351            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
    352            request.args = "%s %s" % (arg_str, request.args)
    353 
    354        # Make sure we found the target request
    355        if self.input_files is None:
    356            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
    357            self.input_files = []
    358            self.filter_files = []
    359            self.rules_by_file = []
    360 
    361    def _set_files(self, files):
    362        # Note: The input files to genrb for a certain category should always
    363        # be the same. For example, there are often two genrb calls: one for
    364        # --writePoolBundle, and the other for --usePoolBundle. They are both
    365        # expected to have the same list of input files.
    366        if self.input_files is not None:
    367            assert self.input_files == files
    368            return
    369        self.input_files = list(files)
    370        self.filter_files = [
    371            TmpFile("%s/%s" % (self.filter_tmp_dir, basename))
    372            for basename in (
    373                file.filename[file.filename.rfind("/")+1:]
    374                for file in files
    375            )
    376        ]
    377        if self.strategy == "additive":
    378            self.rules_by_file = [
    379                [r"-/", r"+/%%ALIAS", r"+/%%Parent"]
    380                for _ in range(len(files))
    381            ]
    382        else:
    383            self.rules_by_file = [
    384                [r"+/"]
    385                for _ in range(len(files))
    386            ]
    387 
    388    def add_rules(self, file_filter, rules):
    389        for file, rule_list in zip(self.input_files, self.rules_by_file):
    390            if file_filter.match(file):
    391                rule_list += rules
    392 
    393    def make_requests(self):
    394        # Map from rule list to filter files with that rule list
    395        unique_rules = defaultdict(list)
    396        for filter_file, rules in zip(self.filter_files, self.rules_by_file):
    397            unique_rules[tuple(rules)].append(filter_file)
    398 
    399        new_requests = []
    400        i = 0
    401        for rules, filter_files in unique_rules.items():
    402            base_filter_file = filter_files[0]
    403            new_requests += [
    404                PrintFileRequest(
    405                    name = "%s_print_%d" % (self.category, i),
    406                    output_file = base_filter_file,
    407                    content = self._generate_resource_filter_txt(rules)
    408                )
    409            ]
    410            i += 1
    411            for filter_file in filter_files[1:]:
    412                new_requests += [
    413                    CopyRequest(
    414                        name = "%s_copy_%d" % (self.category, i),
    415                        input_file = base_filter_file,
    416                        output_file = filter_file
    417                    )
    418                ]
    419                i += 1
    420        return new_requests
    421 
    422    @staticmethod
    423    def _generate_resource_filter_txt(rules):
    424        result = "# Caution: This file is automatically generated\n\n"
    425        result += "\n".join(rules)
    426        return result
    427 
    428 
    429 def _apply_resource_filters(all_requests, config, io):
    430    """Creates filters for looking within resource bundle files."""
    431    json_data = config.filters_json_data
    432    if "resourceFilters" not in json_data:
    433        return all_requests
    434 
    435    collected = {}
    436    for entry in json_data["resourceFilters"]:
    437        if "files" in entry:
    438            file_filter = Filter.create_from_json(entry["files"], io)
    439        else:
    440            file_filter = InclusionFilter()
    441        for category in entry["categories"]:
    442            # not defaultdict because we need to pass arguments to the constructor
    443            if category not in collected:
    444                filter_info = ResourceFilterInfo(category, config.strategy)
    445                filter_info.apply_to_requests(all_requests)
    446                collected[category] = filter_info
    447            else:
    448                filter_info = collected[category]
    449            filter_info.add_rules(file_filter, entry["rules"])
    450 
    451    # Add the filter generation requests to the beginning so that by default
    452    # they are made before genrb gets run (order is required by windirect)
    453    new_requests = []
    454    for filter_info in collected.values():
    455        new_requests += filter_info.make_requests()
    456    new_requests += all_requests
    457    return new_requests