tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

md5_check.py (15881B)


      1 # Copyright 2013 The Chromium Authors
      2 # Use of this source code is governed by a BSD-style license that can be
      3 # found in the LICENSE file.
      4 
      5 
      6 import difflib
      7 import hashlib
      8 import itertools
      9 import json
     10 import os
     11 import sys
     12 import zipfile
     13 
     14 from util import build_utils
     15 import action_helpers  # build_utils adds //build to sys.path.
     16 import print_python_deps
     17 
# When set (PRINT_BUILD_EXPLANATIONS=1) and a difference is detected, a diff
# of what changed is printed.
PRINT_EXPLANATIONS = int(os.environ.get('PRINT_BUILD_EXPLANATIONS', 0))

# An escape hatch that causes all targets to be rebuilt (FORCE_REBUILD=1).
_FORCE_REBUILD = int(os.environ.get('FORCE_REBUILD', 0))
     23 
     24 
     25 def CallAndWriteDepfileIfStale(on_stale_md5,
     26                               options,
     27                               record_path=None,
     28                               input_paths=None,
     29                               input_strings=None,
     30                               output_paths=None,
     31                               force=False,
     32                               pass_changes=False,
     33                               track_subpaths_allowlist=None,
     34                               depfile_deps=None):
     35  """Wraps CallAndRecordIfStale() and writes a depfile if applicable.
     36 
     37  Depfiles are automatically added to output_paths when present in the |options|
     38  argument. They are then created after |on_stale_md5| is called.
     39 
     40  By default, only python dependencies are added to the depfile. If there are
     41  other input paths that are not captured by GN deps, then they should be listed
     42  in depfile_deps. It's important to write paths to the depfile that are already
     43  captured by GN deps since GN args can cause GN deps to change, and such
     44  changes are not immediately reflected in depfiles (http://crbug.com/589311).
     45  """
     46  if not output_paths:
     47    raise Exception('At least one output_path must be specified.')
     48  input_paths = list(input_paths or [])
     49  input_strings = list(input_strings or [])
     50  output_paths = list(output_paths or [])
     51 
     52  input_paths += print_python_deps.ComputePythonDependencies()
     53 
     54  CallAndRecordIfStale(
     55      on_stale_md5,
     56      record_path=record_path,
     57      input_paths=input_paths,
     58      input_strings=input_strings,
     59      output_paths=output_paths,
     60      force=force,
     61      pass_changes=pass_changes,
     62      track_subpaths_allowlist=track_subpaths_allowlist)
     63 
     64  # Write depfile even when inputs have not changed to ensure build correctness
     65  # on bots that build with & without patch, and the patch changes the depfile
     66  # location.
     67  if hasattr(options, 'depfile') and options.depfile:
     68    action_helpers.write_depfile(options.depfile, output_paths[0], depfile_deps)
     69 
     70 
def CallAndRecordIfStale(function,
                         record_path=None,
                         input_paths=None,
                         input_strings=None,
                         output_paths=None,
                         force=False,
                         pass_changes=False,
                         track_subpaths_allowlist=None):
  """Calls function if outputs are stale.

  Outputs are considered stale if:
  - any output_paths are missing, or
  - the contents of any file within input_paths has changed, or
  - the contents of input_strings has changed.

  To debug which files are out-of-date, set the environment variable:
      PRINT_BUILD_EXPLANATIONS=1

  Args:
    function: The function to call.
    record_path: Path to record metadata.
      Defaults to output_paths[0] + '.md5.stamp'
    input_paths: List of paths to calculate an md5 sum on.
    input_strings: List of strings to record verbatim.
    output_paths: List of output paths.
    force: Whether to treat outputs as missing regardless of whether they
      actually are.
    pass_changes: Whether to pass a Changes instance to |function|.
    track_subpaths_allowlist: Relevant only when pass_changes=True. List of .zip
      files from |input_paths| to make subpath information available for.
  """
  assert record_path or output_paths
  input_paths = input_paths or []
  input_strings = input_strings or []
  output_paths = output_paths or []
  record_path = record_path or output_paths[0] + '.md5.stamp'

  assert record_path.endswith('.stamp'), (
      'record paths must end in \'.stamp\' so that they are easy to find '
      'and delete')

  # Per-entry tracking is only needed when someone will inspect the diff
  # (pass_changes) or when explanations are being printed.
  new_metadata = _Metadata(track_entries=pass_changes or PRINT_EXPLANATIONS)
  new_metadata.AddStrings(input_strings)

  zip_allowlist = set(track_subpaths_allowlist or [])
  for path in input_paths:
    # Normalize so that record files compare equal across build dirs.
    if os.path.isabs(path):
      path = os.path.relpath(path)
    # It's faster to md5 an entire zip file than it is to just locate & hash
    # its central directory (which is what this used to do).
    if path in zip_allowlist:
      entries = _ExtractZipEntries(path)
      new_metadata.AddZipFile(path, entries)
    else:
      new_metadata.AddFile(path, _ComputeTagForPath(path))

  force = force or _FORCE_REBUILD
  missing_outputs = [x for x in output_paths if force or not os.path.exists(x)]
  old_metadata = None

  # Only bother reading the previous stamp when all outputs exist; a missing
  # output forces a rebuild regardless of what the stamp says.
  if not missing_outputs and os.path.exists(record_path):
    with open(record_path, 'r') as jsonfile:
      try:
        old_metadata = _Metadata.FromFile(jsonfile)
      except:  # pylint: disable=bare-except
        pass  # Not yet using new file format.

  changes = Changes(old_metadata, new_metadata, force, missing_outputs)
  if not changes.HasChanges():
    return

  if PRINT_EXPLANATIONS:
    print('=' * 80)
    print('Target is stale: %s' % record_path)
    print(changes.DescribeDifference())
    print('=' * 80)

  args = (changes,) if pass_changes else ()
  function(*args)

  # Record the new state only after |function| succeeds, so a failed build
  # stays stale and re-runs next time.
  with open(record_path, 'w') as f:
    new_metadata.ToFile(f)
    153 
    154 
    155 class Changes:
    156  """Provides and API for querying what changed between runs."""
    157 
    158  def __init__(self, old_metadata, new_metadata, force, missing_outputs):
    159    self.old_metadata = old_metadata
    160    self.new_metadata = new_metadata
    161    self.force = force
    162    self.missing_outputs = missing_outputs
    163 
    164  def _GetOldTag(self, path, subpath=None):
    165    return self.old_metadata and self.old_metadata.GetTag(path, subpath)
    166 
    167  def HasChanges(self):
    168    """Returns whether any changes exist."""
    169    return (self.HasStringChanges()
    170            or self.old_metadata.FilesMd5() != self.new_metadata.FilesMd5())
    171 
    172  def HasStringChanges(self):
    173    """Returns whether string metadata changed."""
    174    return (self.force or not self.old_metadata
    175            or self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5())
    176 
    177  def AddedOrModifiedOnly(self):
    178    """Returns whether the only changes were from added or modified (sub)files.
    179 
    180    No missing outputs, no removed paths/subpaths.
    181    """
    182    if self.HasStringChanges():
    183      return False
    184    if any(self.IterRemovedPaths()):
    185      return False
    186    for path in self.IterModifiedPaths():
    187      if any(self.IterRemovedSubpaths(path)):
    188        return False
    189    return True
    190 
    191  def IterAllPaths(self):
    192    """Generator for paths."""
    193    return self.new_metadata.IterPaths();
    194 
    195  def IterAllSubpaths(self, path):
    196    """Generator for subpaths."""
    197    return self.new_metadata.IterSubpaths(path);
    198 
    199  def IterAddedPaths(self):
    200    """Generator for paths that were added."""
    201    for path in self.new_metadata.IterPaths():
    202      if self._GetOldTag(path) is None:
    203        yield path
    204 
    205  def IterAddedSubpaths(self, path):
    206    """Generator for paths that were added within the given zip file."""
    207    for subpath in self.new_metadata.IterSubpaths(path):
    208      if self._GetOldTag(path, subpath) is None:
    209        yield subpath
    210 
    211  def IterRemovedPaths(self):
    212    """Generator for paths that were removed."""
    213    if self.old_metadata:
    214      for path in self.old_metadata.IterPaths():
    215        if self.new_metadata.GetTag(path) is None:
    216          yield path
    217 
    218  def IterRemovedSubpaths(self, path):
    219    """Generator for paths that were removed within the given zip file."""
    220    if self.old_metadata:
    221      for subpath in self.old_metadata.IterSubpaths(path):
    222        if self.new_metadata.GetTag(path, subpath) is None:
    223          yield subpath
    224 
    225  def IterModifiedPaths(self):
    226    """Generator for paths whose contents have changed."""
    227    for path in self.new_metadata.IterPaths():
    228      old_tag = self._GetOldTag(path)
    229      new_tag = self.new_metadata.GetTag(path)
    230      if old_tag is not None and old_tag != new_tag:
    231        yield path
    232 
    233  def IterModifiedSubpaths(self, path):
    234    """Generator for paths within a zip file whose contents have changed."""
    235    for subpath in self.new_metadata.IterSubpaths(path):
    236      old_tag = self._GetOldTag(path, subpath)
    237      new_tag = self.new_metadata.GetTag(path, subpath)
    238      if old_tag is not None and old_tag != new_tag:
    239        yield subpath
    240 
    241  def IterChangedPaths(self):
    242    """Generator for all changed paths (added/removed/modified)."""
    243    return itertools.chain(self.IterRemovedPaths(),
    244                           self.IterModifiedPaths(),
    245                           self.IterAddedPaths())
    246 
    247  def IterChangedSubpaths(self, path):
    248    """Generator for paths within a zip that were added/removed/modified."""
    249    return itertools.chain(self.IterRemovedSubpaths(path),
    250                           self.IterModifiedSubpaths(path),
    251                           self.IterAddedSubpaths(path))
    252 
    253  def DescribeDifference(self):
    254    """Returns a human-readable description of what changed."""
    255    if self.force:
    256      return 'force=True'
    257    if self.missing_outputs:
    258      return 'Outputs do not exist:\n  ' + '\n  '.join(self.missing_outputs)
    259    if self.old_metadata is None:
    260      return 'Previous stamp file not found.'
    261 
    262    if self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5():
    263      ndiff = difflib.ndiff(self.old_metadata.GetStrings(),
    264                            self.new_metadata.GetStrings())
    265      changed = [s for s in ndiff if not s.startswith(' ')]
    266      return 'Input strings changed:\n  ' + '\n  '.join(changed)
    267 
    268    if self.old_metadata.FilesMd5() == self.new_metadata.FilesMd5():
    269      return "There's no difference."
    270 
    271    lines = []
    272    lines.extend('Added: ' + p for p in self.IterAddedPaths())
    273    lines.extend('Removed: ' + p for p in self.IterRemovedPaths())
    274    for path in self.IterModifiedPaths():
    275      lines.append('Modified: ' + path)
    276      lines.extend('  -> Subpath added: ' + p
    277                   for p in self.IterAddedSubpaths(path))
    278      lines.extend('  -> Subpath removed: ' + p
    279                   for p in self.IterRemovedSubpaths(path))
    280      lines.extend('  -> Subpath modified: ' + p
    281                   for p in self.IterModifiedSubpaths(path))
    282    if lines:
    283      return 'Input files changed:\n  ' + '\n  '.join(lines)
    284    return 'I have no idea what changed (there is a bug).'
    285 
    286 
    287 class _Metadata:
    288  """Data model for tracking change metadata.
    289 
    290  Args:
    291    track_entries: Enables per-file change tracking. Slower, but required for
    292        Changes functionality.
    293  """
    294  # Schema:
    295  # {
    296  #   "files-md5": "VALUE",
    297  #   "strings-md5": "VALUE",
    298  #   "input-files": [
    299  #     {
    300  #       "path": "path.jar",
    301  #       "tag": "{MD5 of entries}",
    302  #       "entries": [
    303  #         { "path": "org/chromium/base/Foo.class", "tag": "{CRC32}" }, ...
    304  #       ]
    305  #     }, {
    306  #       "path": "path.txt",
    307  #       "tag": "{MD5}",
    308  #     }
    309  #   ],
    310  #   "input-strings": ["a", "b", ...],
    311  # }
    312  def __init__(self, track_entries=False):
    313    self._track_entries = track_entries
    314    self._files_md5 = None
    315    self._strings_md5 = None
    316    self._files = []
    317    self._strings = []
    318    # Map of (path, subpath) -> entry. Created upon first call to _GetEntry().
    319    self._file_map = None
    320 
    321  @classmethod
    322  def FromFile(cls, fileobj):
    323    """Returns a _Metadata initialized from a file object."""
    324    ret = cls()
    325    obj = json.load(fileobj)
    326    ret._files_md5 = obj['files-md5']
    327    ret._strings_md5 = obj['strings-md5']
    328    ret._files = obj.get('input-files', [])
    329    ret._strings = obj.get('input-strings', [])
    330    return ret
    331 
    332  def ToFile(self, fileobj):
    333    """Serializes metadata to the given file object."""
    334    obj = {
    335        'files-md5': self.FilesMd5(),
    336        'strings-md5': self.StringsMd5(),
    337    }
    338    if self._track_entries:
    339      obj['input-files'] = sorted(self._files, key=lambda e: e['path'])
    340      obj['input-strings'] = self._strings
    341 
    342    json.dump(obj, fileobj, indent=2)
    343 
    344  def _AssertNotQueried(self):
    345    assert self._files_md5 is None
    346    assert self._strings_md5 is None
    347    assert self._file_map is None
    348 
    349  def AddStrings(self, values):
    350    self._AssertNotQueried()
    351    self._strings.extend(str(v) for v in values)
    352 
    353  def AddFile(self, path, tag):
    354    """Adds metadata for a non-zip file.
    355 
    356    Args:
    357      path: Path to the file.
    358      tag: A short string representative of the file contents.
    359    """
    360    self._AssertNotQueried()
    361    self._files.append({
    362        'path': path,
    363        'tag': tag,
    364    })
    365 
    366  def AddZipFile(self, path, entries):
    367    """Adds metadata for a zip file.
    368 
    369    Args:
    370      path: Path to the file.
    371      entries: List of (subpath, tag) tuples for entries within the zip.
    372    """
    373    self._AssertNotQueried()
    374    tag = _ComputeInlineMd5(itertools.chain((e[0] for e in entries),
    375                                            (e[1] for e in entries)))
    376    self._files.append({
    377        'path': path,
    378        'tag': tag,
    379        'entries': [{"path": e[0], "tag": e[1]} for e in entries],
    380    })
    381 
    382  def GetStrings(self):
    383    """Returns the list of input strings."""
    384    return self._strings
    385 
    386  def FilesMd5(self):
    387    """Lazily computes and returns the aggregate md5 of input files."""
    388    if self._files_md5 is None:
    389      # Omit paths from md5 since temporary files have random names.
    390      self._files_md5 = _ComputeInlineMd5(
    391          self.GetTag(p) for p in sorted(self.IterPaths()))
    392    return self._files_md5
    393 
    394  def StringsMd5(self):
    395    """Lazily computes and returns the aggregate md5 of input strings."""
    396    if self._strings_md5 is None:
    397      self._strings_md5 = _ComputeInlineMd5(self._strings)
    398    return self._strings_md5
    399 
    400  def _GetEntry(self, path, subpath=None):
    401    """Returns the JSON entry for the given path / subpath."""
    402    if self._file_map is None:
    403      self._file_map = {}
    404      for entry in self._files:
    405        self._file_map[(entry['path'], None)] = entry
    406        for subentry in entry.get('entries', ()):
    407          self._file_map[(entry['path'], subentry['path'])] = subentry
    408    return self._file_map.get((path, subpath))
    409 
    410  def GetTag(self, path, subpath=None):
    411    """Returns the tag for the given path / subpath."""
    412    ret = self._GetEntry(path, subpath)
    413    return ret and ret['tag']
    414 
    415  def IterPaths(self):
    416    """Returns a generator for all top-level paths."""
    417    return (e['path'] for e in self._files)
    418 
    419  def IterSubpaths(self, path):
    420    """Returns a generator for all subpaths in the given zip.
    421 
    422    If the given path is not a zip file or doesn't exist, returns an empty
    423    iterable.
    424    """
    425    outer_entry = self._GetEntry(path)
    426    if not outer_entry:
    427      return ()
    428    subentries = outer_entry.get('entries', [])
    429    return (entry['path'] for entry in subentries)
    430 
    431 
    432 def _ComputeTagForPath(path):
    433  stat = os.stat(path)
    434  if stat.st_size > 1 * 1024 * 1024:
    435    # Fallback to mtime for large files so that md5_check does not take too long
    436    # to run.
    437    return stat.st_mtime
    438  md5 = hashlib.md5()
    439  with open(path, 'rb') as f:
    440    md5.update(f.read())
    441  return md5.hexdigest()
    442 
    443 
    444 def _ComputeInlineMd5(iterable):
    445  """Computes the md5 of the concatenated parameters."""
    446  md5 = hashlib.md5()
    447  for item in iterable:
    448    md5.update(str(item).encode('ascii'))
    449  return md5.hexdigest()
    450 
    451 
    452 def _ExtractZipEntries(path):
    453  """Returns a list of (path, CRC32) of all files within |path|."""
    454  entries = []
    455  with zipfile.ZipFile(path) as zip_file:
    456    for zip_info in zip_file.infolist():
    457      # Skip directories and empty files.
    458      if zip_info.CRC:
    459        entries.append(
    460            (zip_info.filename, zip_info.CRC + zip_info.compress_type))
    461  return entries