filtration.py (16480B)
1 # Copyright (C) 2018 and later: Unicode, Inc. and others. 2 # License & terms of use: http://www.unicode.org/copyright.html 3 4 # Python 2/3 Compatibility (ICU-20299) 5 # TODO(ICU-20301): Remove this. 6 from __future__ import print_function 7 8 from abc import abstractmethod 9 from collections import defaultdict 10 import re 11 import sys 12 13 from . import * 14 from . import utils 15 from .request_types import * 16 17 18 # Note: for this to be a proper abstract class, it should extend abc.ABC. 19 # There is no nice way to do this that works in both Python 2 and 3. 20 # TODO(ICU-20301): Make this inherit from abc.ABC. 21 class Filter(object): 22 @staticmethod 23 def create_from_json(json_data, io): 24 assert io != None 25 if "filterType" in json_data: 26 filter_type = json_data["filterType"] 27 else: 28 filter_type = "file-stem" 29 30 if filter_type == "file-stem": 31 return FileStemFilter(json_data) 32 elif filter_type == "language": 33 return LanguageFilter(json_data) 34 elif filter_type == "regex": 35 return RegexFilter(json_data) 36 elif filter_type == "exclude": 37 return ExclusionFilter() 38 elif filter_type == "union": 39 return UnionFilter(json_data, io) 40 elif filter_type == "intersection": 41 return IntersectionFilter(json_data, io) 42 elif filter_type == "complement": 43 return ComplementFilter(json_data, io) 44 elif filter_type == "locale": 45 return LocaleFilter(json_data, io) 46 else: 47 print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr) 48 return None 49 50 def filter(self, request): 51 if not request.apply_file_filter(self): 52 return [] 53 for file in request.all_input_files(): 54 assert self.match(file) 55 return [request] 56 57 @staticmethod 58 def _file_to_file_stem(file): 59 start = file.filename.rfind("/") 60 limit = file.filename.rfind(".") 61 return file.filename[start+1:limit] 62 63 @staticmethod 64 def _file_to_subdir(file): 65 limit = file.filename.rfind("/") 66 if limit == -1: 67 return None 68 return 
file.filename[:limit] 69 70 @abstractmethod 71 def match(self, file): 72 pass 73 74 75 class InclusionFilter(Filter): 76 def match(self, file): 77 return True 78 79 80 class ExclusionFilter(Filter): 81 def match(self, file): 82 return False 83 84 85 class IncludeExcludeFilter(Filter): 86 def __init__(self, json_data): 87 if "whitelist" in json_data: 88 self.is_includelist = True 89 self.includelist = json_data["whitelist"] 90 elif "includelist" in json_data: 91 self.is_includelist = True 92 self.includelist = json_data["includelist"] 93 elif "blacklist" in json_data: 94 self.is_includelist = False 95 self.excludelist = json_data["blacklist"] 96 elif "excludelist" in json_data: 97 self.is_includelist = False 98 self.excludelist = json_data["excludelist"] 99 else: 100 raise AssertionError("Need either includelist or excludelist: %s" % str(json_data)) 101 102 def match(self, file): 103 file_stem = self._file_to_file_stem(file) 104 return self._should_include(file_stem) 105 106 @abstractmethod 107 def _should_include(self, file_stem): 108 pass 109 110 111 class FileStemFilter(IncludeExcludeFilter): 112 def _should_include(self, file_stem): 113 if self.is_includelist: 114 return file_stem in self.includelist 115 else: 116 return file_stem not in self.excludelist 117 118 119 class LanguageFilter(IncludeExcludeFilter): 120 def _should_include(self, file_stem): 121 language = file_stem.split("_")[0] 122 if language == "root": 123 # Always include root.txt 124 return True 125 if self.is_includelist: 126 return language in self.includelist 127 else: 128 return language not in self.excludelist 129 130 131 class RegexFilter(IncludeExcludeFilter): 132 def __init__(self, *args): 133 # TODO(ICU-20301): Change this to: super().__init__(*args) 134 super(RegexFilter, self).__init__(*args) 135 if self.is_includelist: 136 self.includelist = [re.compile(pat) for pat in self.includelist] 137 else: 138 self.excludelist = [re.compile(pat) for pat in self.excludelist] 139 140 def 
class UnionFilter(Filter):
    """Matches iff at least one of its sub-filters matches."""

    def __init__(self, json_data, io):
        # Collect the sub-filters.
        self.sub_filters = []
        for filter_json in json_data["unionOf"]:
            self.sub_filters.append(Filter.create_from_json(filter_json, io))

    def match(self, file):
        """Match iff any of the sub-filters match."""
        # any() expresses the intent directly and avoids shadowing the
        # built-in name "filter" with a loop variable.
        return any(f.match(file) for f in self.sub_filters)


class IntersectionFilter(Filter):
    """Matches iff every one of its sub-filters matches."""

    def __init__(self, json_data, io):
        # Collect the sub-filters.
        self.sub_filters = []
        for filter_json in json_data["intersectionOf"]:
            self.sub_filters.append(Filter.create_from_json(filter_json, io))

    def match(self, file):
        """Match iff all of the sub-filters match."""
        return all(f.match(file) for f in self.sub_filters)


class ComplementFilter(Filter):
    """Matches iff its single sub-filter does not match."""

    def __init__(self, json_data, io):
        # There is only one sub-filter.
        filter_json = json_data["complementOf"]
        self.sub_filter = Filter.create_from_json(filter_json, io)

    def match(self, file):
        """Match iff the sub-filter does not match."""
        return not self.sub_filter.match(file)
# Matches a language+script locale such as "sr_Latn"; group 1 captures the
# bare language subtag.
LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$")
# Matches a bare language subtag such as "en" or "fil".
LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$")

class LocaleFilter(Filter):
    """Matches resource files belonging to a requested set of locales.

    Ancestors of a requested locale are always matched; descendants and
    alternate-script variants are matched according to the
    "includeChildren" (default True) and "includeScripts" (default False)
    options of the filter JSON.
    """

    def __init__(self, json_data, io):
        # "whitelist" is the deprecated alias for "includelist".
        if "whitelist" in json_data:
            self.locales_requested = list(json_data["whitelist"])
        elif "includelist" in json_data:
            self.locales_requested = list(json_data["includelist"])
        else:
            raise AssertionError("You must have an includelist in a locale filter")
        self.include_children = json_data.get("includeChildren", True)
        self.include_scripts = json_data.get("includeScripts", False)

        # Load the dependency graph from disk
        self.dependency_data_by_tree = {
            tree: io.read_locale_deps(tree)
            for tree in utils.ALL_TREES
        }

    def match(self, file):
        # The subdirectory identifies the locale tree (e.g. "locales", "curr").
        tree = self._file_to_subdir(file)
        assert tree is not None
        locale = self._file_to_file_stem(file)

        # A locale is *required* if it is *requested* or an ancestor of a
        # *requested* locale.
        if locale in self._locales_required(tree):
            return True

        # Resolve include_scripts and include_children.
        return self._match_recursive(locale, tree)

    def _match_recursive(self, locale, tree):
        """Walks up the locale tree looking for a *requested* locale."""
        # Base case: return True if we reached a *requested* locale,
        # or False if we ascend out of the locale tree.
        if locale is None:
            return False
        if locale in self.locales_requested:
            return True

        # Check for alternative scripts.
        # This causes sr_Latn to check sr instead of going directly to root.
        if self.include_scripts:
            match = LANGUAGE_SCRIPT_REGEX.match(locale)
            if match and self._match_recursive(match.group(1), tree):
                return True

        # Check if we are a descendant of a *requested* locale.
        if self.include_children:
            parent = self._get_parent_locale(locale, tree)
            if self._match_recursive(parent, tree):
                return True

        # No matches.
        return False

    def _get_parent_locale(self, locale, tree):
        """Gets the parent locale in the given tree, according to dependency data."""
        dependency_data = self.dependency_data_by_tree[tree]
        # Explicit parent/alias mappings from the dependency data win.
        if "parents" in dependency_data and locale in dependency_data["parents"]:
            return dependency_data["parents"][locale]
        if "aliases" in dependency_data and locale in dependency_data["aliases"]:
            return dependency_data["aliases"][locale]
        # A bare language subtag falls back to root.
        if LANGUAGE_ONLY_REGEX.match(locale):
            return "root"
        # Otherwise truncate the last "_"-separated subtag.
        i = locale.rfind("_")
        if i < 0:
            # Only root itself has no parent.
            assert locale == "root", "Invalid locale: %s/%s" % (tree, locale)
            return None
        return locale[:i]

    def _locales_required(self, tree):
        """Returns a generator of all required locales in the given tree."""
        # Walks from each requested locale up to root, yielding every locale
        # along the way (duplicates are possible and harmless).
        for locale in self.locales_requested:
            while locale is not None:
                yield locale
                locale = self._get_parent_locale(locale, tree)


def apply_filters(requests, config, io):
    """Runs the filters and returns a new list of requests."""
    requests = _apply_file_filters(requests, config, io)
    requests = _apply_resource_filters(requests, config, io)
    return requests


def _apply_file_filters(old_requests, config, io):
    """Filters out entire files."""
    filters = _preprocess_file_filters(old_requests, config, io)
    new_requests = []
    for request in old_requests:
        category = request.category
        # Categories with no configured filter pass through unchanged.
        if category in filters:
            new_requests += filters[category].filter(request)
        else:
            new_requests.append(request)
    return new_requests
def _preprocess_file_filters(requests, config, io):
    """Builds the category -> Filter map consumed by _apply_file_filters.

    Categories that should be included wholesale get no entry in the map;
    the caller passes such requests through unfiltered.
    """
    all_categories = set(
        request.category
        for request in requests
    )
    # Some requests carry no category; drop the None placeholder.
    # discard() (unlike remove()) is a no-op when None is absent, so this
    # no longer raises KeyError when every request has a category.
    all_categories.discard(None)
    # sorted() already returns a list; the extra list() wrapper was redundant.
    all_categories = sorted(all_categories)
    json_data = config.filters_json_data
    filters = {}
    # Under the "additive" strategy everything is excluded unless requested.
    default_filter_json = "exclude" if config.strategy == "additive" else "include"
    for category in all_categories:
        filter_json = default_filter_json
        # Special default for category "brkitr_lstm" and "brkitr_adaboost" as "exclude" for now.
        if "brkitr_lstm" == category or "brkitr_adaboost" == category:
            filter_json = "exclude"
        # Figure out the correct filter to create for now.
        if "featureFilters" in json_data and category in json_data["featureFilters"]:
            filter_json = json_data["featureFilters"][category]
        if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"):
            filter_json = json_data["localeFilter"]
        # Resolve the filter JSON into a filter object
        if filter_json == "exclude":
            filters[category] = ExclusionFilter()
        elif filter_json == "include":
            pass  # no-op
        else:
            filters[category] = Filter.create_from_json(filter_json, io)
    # Warn about configured filters whose category never appears in the requests.
    if "featureFilters" in json_data:
        for category in json_data["featureFilters"]:
            if category not in all_categories:
                print("Warning: category %s is not known" % category, file=sys.stderr)
    return filters


class ResourceFilterInfo(object):
    """Per-category state for generating genrb resource-filter rule files."""

    def __init__(self, category, strategy):
        self.category = category
        self.strategy = strategy
        # Directory (under TMP_DIR) where the generated filter files live.
        self.filter_tmp_dir = "filters/%s" % category
        # Populated by apply_to_requests/_set_files.
        self.input_files = None
        self.filter_files = None
        self.rules_by_file = None

    def apply_to_requests(self, all_requests):
        """Attaches filter files and a --filterDir arg to this category's genrb requests.

        Call this method only once per list of requests.
        """
        assert self.input_files is None
        for request in all_requests:
            # Only genrb execution requests of this category with input files apply.
            if request.category != self.category:
                continue
            if not isinstance(request, AbstractExecutionRequest):
                continue
            if request.tool != IcuTool("genrb"):
                continue
            if not request.input_files:
                continue
            self._set_files(request.input_files)
            request.dep_targets += [self.filter_files[:]]
            arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir
            request.args = "%s %s" % (arg_str, request.args)

        # Make sure we found the target request
        if self.input_files is None:
            print("WARNING: Category not found: %s" % self.category, file=sys.stderr)
            self.input_files = []
            self.filter_files = []
            self.rules_by_file = []
339 assert self.input_files is None 340 for request in all_requests: 341 if request.category != self.category: 342 continue 343 if not isinstance(request, AbstractExecutionRequest): 344 continue 345 if request.tool != IcuTool("genrb"): 346 continue 347 if not request.input_files: 348 continue 349 self._set_files(request.input_files) 350 request.dep_targets += [self.filter_files[:]] 351 arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir 352 request.args = "%s %s" % (arg_str, request.args) 353 354 # Make sure we found the target request 355 if self.input_files is None: 356 print("WARNING: Category not found: %s" % self.category, file=sys.stderr) 357 self.input_files = [] 358 self.filter_files = [] 359 self.rules_by_file = [] 360 361 def _set_files(self, files): 362 # Note: The input files to genrb for a certain category should always 363 # be the same. For example, there are often two genrb calls: one for 364 # --writePoolBundle, and the other for --usePoolBundle. They are both 365 # expected to have the same list of input files. 
366 if self.input_files is not None: 367 assert self.input_files == files 368 return 369 self.input_files = list(files) 370 self.filter_files = [ 371 TmpFile("%s/%s" % (self.filter_tmp_dir, basename)) 372 for basename in ( 373 file.filename[file.filename.rfind("/")+1:] 374 for file in files 375 ) 376 ] 377 if self.strategy == "additive": 378 self.rules_by_file = [ 379 [r"-/", r"+/%%ALIAS", r"+/%%Parent"] 380 for _ in range(len(files)) 381 ] 382 else: 383 self.rules_by_file = [ 384 [r"+/"] 385 for _ in range(len(files)) 386 ] 387 388 def add_rules(self, file_filter, rules): 389 for file, rule_list in zip(self.input_files, self.rules_by_file): 390 if file_filter.match(file): 391 rule_list += rules 392 393 def make_requests(self): 394 # Map from rule list to filter files with that rule list 395 unique_rules = defaultdict(list) 396 for filter_file, rules in zip(self.filter_files, self.rules_by_file): 397 unique_rules[tuple(rules)].append(filter_file) 398 399 new_requests = [] 400 i = 0 401 for rules, filter_files in unique_rules.items(): 402 base_filter_file = filter_files[0] 403 new_requests += [ 404 PrintFileRequest( 405 name = "%s_print_%d" % (self.category, i), 406 output_file = base_filter_file, 407 content = self._generate_resource_filter_txt(rules) 408 ) 409 ] 410 i += 1 411 for filter_file in filter_files[1:]: 412 new_requests += [ 413 CopyRequest( 414 name = "%s_copy_%d" % (self.category, i), 415 input_file = base_filter_file, 416 output_file = filter_file 417 ) 418 ] 419 i += 1 420 return new_requests 421 422 @staticmethod 423 def _generate_resource_filter_txt(rules): 424 result = "# Caution: This file is automatically generated\n\n" 425 result += "\n".join(rules) 426 return result 427 428 429 def _apply_resource_filters(all_requests, config, io): 430 """Creates filters for looking within resource bundle files.""" 431 json_data = config.filters_json_data 432 if "resourceFilters" not in json_data: 433 return all_requests 434 435 collected = {} 436 for entry in 
json_data["resourceFilters"]: 437 if "files" in entry: 438 file_filter = Filter.create_from_json(entry["files"], io) 439 else: 440 file_filter = InclusionFilter() 441 for category in entry["categories"]: 442 # not defaultdict because we need to pass arguments to the constructor 443 if category not in collected: 444 filter_info = ResourceFilterInfo(category, config.strategy) 445 filter_info.apply_to_requests(all_requests) 446 collected[category] = filter_info 447 else: 448 filter_info = collected[category] 449 filter_info.add_rules(file_filter, entry["rules"]) 450 451 # Add the filter generation requests to the beginning so that by default 452 # they are made before genrb gets run (order is required by windirect) 453 new_requests = [] 454 for filter_info in collected.values(): 455 new_requests += filter_info.make_requests() 456 new_requests += all_requests 457 return new_requests