archive.py (5951B)
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import bz2
import gzip
import stat
import tarfile

from .files import BaseFile, File

# 2016-01-01T00:00:00+0000
DEFAULT_MTIME = 1451606400


# Python 3.9 contains this change:
#   https://github.com/python/cpython/commit/674935b8caf33e47c78f1b8e197b1b77a04992d2
# which changes the output of tar creation compared to earlier versions.
# This code is used to generate tar files that are meant to be deterministic
# across versions of python (specifically, it's used as part of computing the
# hash of docker images, which needs to be identical between CI (which uses
# python 3.8), and developer environments (using arbitrary versions of python,
# at this point, most probably more recent than 3.9)).
# What we do is subclass TarInfo so that if used on python >= 3.9, it
# reproduces the behavior from python < 3.9.
# Here's how it goes:
# - the behavior in python >= 3.9 is the same as python < 3.9 when the type
#   encoded in the tarinfo is CHRTYPE or BLKTYPE.
# - the value of the type is only compared in the context of choosing which
#   behavior to take
# - we replace the type with the same value (so that using the value has no
#   changes) but that pretends to be the same as CHRTYPE so that the condition
#   that enables the old behavior is taken.
class HackedType(bytes):
    """Bytes value that additionally compares equal to ``tarfile.CHRTYPE``.

    The underlying byte content is untouched; only ``==`` is altered.
    """

    def __eq__(self, other):
        if other == tarfile.CHRTYPE:
            return True
        # Fall back to the plain bytes comparison. Writing ``self == other``
        # here would re-enter this method and recurse until RecursionError.
        return bytes.__eq__(self, other)


class TarInfo(tarfile.TarInfo):
    """``tarfile.TarInfo`` that wraps the header type in ``HackedType``."""

    @staticmethod
    def _create_header(info, format, encoding, errors):
        # Signature mirrors the private tarfile.TarInfo._create_header that
        # this overrides, hence the ``format`` parameter name.
        info["type"] = HackedType(info["type"])
        return tarfile.TarInfo._create_header(info, format, encoding, errors)


def create_tar_from_files(fp, files):
    """Create a tar file deterministically.

    Receives a dict mapping names of files in the archive to local filesystem
    paths or ``mozpack.files.BaseFile`` instances.

    The files will be archived and written to the passed file handle opened
    for writing.

    Every entry is recorded as a regular file, in sorted order of archive
    path, owned by uid/gid 0 with empty user/group names and an mtime of
    ``DEFAULT_MTIME``.

    Raises ``ValueError`` if a file's mode has the setuid or setgid bit set.

    FUTURE accept a filename argument (or create APIs to write files)
    """
    # The format is explicitly set to tarfile.GNU_FORMAT, because this default
    # format has been changed in Python 3.8.
    with tarfile.open(
        name="", mode="w", fileobj=fp, dereference=True, format=tarfile.GNU_FORMAT
    ) as tf:
        for archive_path, f in sorted(files.items()):
            if not isinstance(f, BaseFile):
                f = File(f)

            ti = TarInfo(archive_path)
            ti.mode = f.mode or 0o0644
            # Entries are always written as regular files, so there is no
            # need to re-check the type after setting it.
            ti.type = tarfile.REGTYPE

            # Disallow setuid and setgid bits. This is an arbitrary
            # restriction. However, since we set uid/gid to root:root, setuid
            # and setgid would be a glaring security hole if the archive were
            # uncompressed as root.
            if ti.mode & (stat.S_ISUID | stat.S_ISGID):
                raise ValueError("cannot add file with setuid or setgid set: %s" % f)

            # Set uid, gid, username, and group as deterministic values.
            ti.uid = 0
            ti.gid = 0
            ti.uname = ""
            ti.gname = ""

            # Set mtime to a constant value.
            ti.mtime = DEFAULT_MTIME

            ti.size = f.size()
            # tarfile wants to pass a size argument to read(). So just
            # wrap/buffer in a proper file object interface.
            # NOTE(review): the handle returned by f.open() is never closed
            # explicitly; whether closing it here is safe depends on the
            # BaseFile.open() implementations - confirm before changing.
            tf.addfile(ti, f.open())


def create_tar_gz_from_files(fp, files, filename=None, compresslevel=9):
    """Create a tar.gz file deterministically from files.

    This is a glorified wrapper around ``create_tar_from_files`` that
    adds gzip compression.

    ``filename`` is passed to the gzip stream (an empty string is used when
    it is not given) and ``compresslevel`` is the gzip compression level.

    The passed file handle should be opened for writing in binary mode.
    When the function returns, all data has been written to the handle.
    """
    # Offset 3-7 in the gzip header contains an mtime. Pin it to a known
    # value so output is deterministic.
    with gzip.GzipFile(
        filename=filename or "",
        mode="wb",
        fileobj=fp,
        compresslevel=compresslevel,
        mtime=DEFAULT_MTIME,
    ) as gf:
        create_tar_from_files(gf, files)


def create_tar_zst_from_files(fp, files, filename=None, compresslevel=9, threads=1):
    """Create a tar.zst file deterministically from files.

    This is a glorified wrapper around ``create_tar_from_files`` that
    adds zstandard compression.

    ``compresslevel`` and ``threads`` are handed to the zstandard compressor.
    ``filename`` is accepted but not used by this function.

    The passed file handle should be opened for writing in binary mode.
    When the function returns, all data has been written to the handle.
    """
    # Imported lazily so the module can be loaded without zstandard installed.
    import zstandard

    cctx = zstandard.ZstdCompressor(level=compresslevel, threads=threads)
    with cctx.stream_writer(writer=fp) as compressor:
        create_tar_from_files(compressor, files)


class _BZ2Proxy:
    """File object that proxies writes to a bz2 compressor."""

    def __init__(self, fp, compresslevel=9):
        self.fp = fp
        self.compressor = bz2.BZ2Compressor(compresslevel)
        # Number of compressed bytes written to ``fp`` so far.
        self.pos = 0

    def tell(self):
        """Return the count of compressed bytes emitted so far."""
        return self.pos

    def write(self, data):
        """Compress ``data`` and forward whatever output is ready to ``fp``."""
        data = self.compressor.compress(data)
        self.pos += len(data)
        self.fp.write(data)

    def close(self):
        """Flush the compressor into ``fp``; ``fp`` itself is left open."""
        data = self.compressor.flush()
        self.pos += len(data)
        self.fp.write(data)


def create_tar_bz2_from_files(fp, files, compresslevel=9):
    """Create a tar.bz2 file deterministically from files.

    This is a glorified wrapper around ``create_tar_from_files`` that
    adds bzip2 compression.

    This function is similar to ``create_tar_gz_from_files()``.
    """
    proxy = _BZ2Proxy(fp, compresslevel=compresslevel)
    create_tar_from_files(proxy, files)
    proxy.close()