#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""Build partial update MAR files ("funsize") for Firefox update tasks.

The task definition lists pairs of complete MARs ("from" and "to").  This
script downloads them, verifies their signatures and MAR channel IDs,
unpacks them, runs ``make_incremental_update.sh`` to produce each partial
MAR, and writes a ``manifest.json`` describing the generated artifacts.
"""

import argparse
import asyncio
import configparser
import json
import logging
import os
import shutil
import tempfile
import time
from contextlib import AsyncExitStack
from pathlib import Path

import aiohttp
from mardor.reader import MarReader
from mardor.signing import get_keysize
from scriptworker.utils import get_hash, retry_async

log = logging.getLogger(__name__)


ROOT_URL = os.environ.get(
    "TASKCLUSTER_ROOT_URL", "https://firefox-ci-tc.services.mozilla.com"
)
QUEUE_PREFIX = f"{ROOT_URL}/api/queue/"
# Production locations that MAR files may be fetched from.
ALLOWED_URL_PREFIXES = (
    "http://download.cdn.mozilla.net/pub/mozilla.org/firefox/nightly/",
    "http://download.cdn.mozilla.net/pub/firefox/nightly/",
    "http://ftp.mozilla.org/",
    "http://download.mozilla.org/",
    "https://archive.mozilla.org/",
    "http://archive.mozilla.org/",
    QUEUE_PREFIX,
)
# Additional prefixes, honoured only when --allow-staging-prefixes is set.
STAGING_URL_PREFIXES = (
    "http://ftp.stage.mozaws.net/",
    "https://ftp.stage.mozaws.net/",
)

# Per-architecture BCJ filter options exported to make_incremental_update.sh.
BCJ_OPTIONS = {
    "x86": ["--x86"],
    "x86_64": ["--x86"],
    "aarch64": [],
    # macOS Universal Builds
    "macos-x86_64-aarch64": [],
}


def strtobool(value: str):
    """Convert a truthy/falsy string to 1 or 0.

    Copied from `mach.util` since this script runs outside of a mach
    environment.  Reimplementation of distutils.util.strtobool:
    https://docs.python.org/3.9/distutils/apiref.html#distutils.util.strtobool

    Raises ValueError for unrecognised input.
    """
    true_vals = ("y", "yes", "t", "true", "on", "1")
    false_vals = ("n", "no", "f", "false", "off", "0")

    value = value.lower()
    if value in true_vals:
        return 1
    if value in false_vals:
        return 0

    raise ValueError(f"Expected one of: {', '.join(true_vals + false_vals)}")


def verify_signature(mar, cert):
    """Verify the MAR file at *mar* against the signing certificate *cert*.

    Raises ValueError if the signature does not validate.
    """
    log.info("Checking %s signature", mar)
    with open(mar, "rb") as mar_fh:
        m = MarReader(mar_fh)
        if not m.verify(verify_key=cert):
            # Use an f-string here: unlike logging calls, ValueError does
            # not %-format its arguments, so the original tuple form
            # produced an uninterpolated message.
            raise ValueError(
                f"MAR Signature invalid: {mar} ({m.signature_type}) against {cert}"
            )


def process_arguments():
    """Parse and return the command-line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--artifacts-dir", required=True)
    parser.add_argument("--signing-cert", type=argparse.FileType("rb"), required=True)
    parser.add_argument("--task-definition", required=True, type=argparse.FileType("r"))
    parser.add_argument(
        "--allow-staging-prefixes",
        action="store_true",
        default=strtobool(os.environ.get("FUNSIZE_ALLOW_STAGING_PREFIXES", "false")),
        help="Allow files from staging buckets.",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        dest="log_level",
        action="store_const",
        const=logging.INFO,
        default=logging.DEBUG,
    )
    parser.add_argument(
        "--arch",
        type=str,
        required=True,
        choices=BCJ_OPTIONS.keys(),
        help="The architecture you are building.",
    )
    return parser.parse_args()


def validate_mar_channel_id(mar, channel_ids):
    """Check that the MAR's channel IDs are a subset of *channel_ids*.

    *channel_ids* may be a single channel or a comma-separated list.
    Raises ValueError on malformed product info or on a channel mismatch.
    """
    log.info("Checking %s for MAR_CHANNEL_ID %s", mar, channel_ids)
    # We may get a string with a list representation, or a single entry string.
    channel_ids = set(channel_ids.split(","))

    # Close the file handle deterministically; the original version leaked it.
    with open(mar, "rb") as mar_fh:
        product_info = MarReader(mar_fh).productinfo
    if not isinstance(product_info, tuple):
        raise ValueError(f"Malformed product information in mar: {product_info}")

    found_channel_ids = set(product_info[1].split(","))

    if not found_channel_ids.issubset(channel_ids):
        raise ValueError(
            f"MAR_CHANNEL_ID mismatch, {product_info[1]} not in {channel_ids}"
        )

    log.info("%s channel %s in %s", mar, product_info[1], channel_ids)


async def retry_download(*args, semaphore=None, **kwargs):
    """Retry download() calls, optionally bounded by *semaphore*."""
    async with AsyncExitStack() as stack:
        if semaphore:
            await stack.enter_async_context(semaphore)
        await retry_async(
            download,
            retry_exceptions=(aiohttp.ClientError, asyncio.TimeoutError),
            args=args,
            kwargs=kwargs,
        )


def verify_allowed_url(mar, allowed_url_prefixes):
    """Raise ValueError unless *mar* starts with an allowed URL prefix."""
    if not any(mar.startswith(prefix) for prefix in allowed_url_prefixes):
        raise ValueError(
            f"{mar} is not in allowed URL prefixes: {allowed_url_prefixes}"
        )


async def download(url, dest, mode=None):
    """Stream *url* to the file *dest*, optionally chmod'ing it to *mode*.

    Raises aiohttp.ClientError on HTTP errors (raise_for_status=True).
    """
    log.info("Downloading %s to %s", url, dest)
    chunk_size = 4096
    bytes_downloaded = 0
    async with aiohttp.ClientSession(raise_for_status=True) as session:
        start = time.time()
        async with session.get(url, timeout=120) as resp:
            # Additional early logging for download timeouts.
            log.debug("Fetching from url %s", resp.url)
            for history in resp.history:
                log.debug("Redirection history: %s", history.url)
            log.debug("Headers for %s: %s", resp.url, resp.headers)
            if "Content-Length" in resp.headers:
                log.debug(
                    "Content-Length expected for %s: %s",
                    url,
                    resp.headers["Content-Length"],
                )
            # Emit a progress line roughly every 4 MiB downloaded.
            log_interval = chunk_size * 1024
            with open(dest, "wb") as fd:
                while True:
                    chunk = await resp.content.read(chunk_size)
                    if not chunk:
                        break
                    fd.write(chunk)
                    bytes_downloaded += len(chunk)
                    log_interval -= len(chunk)
                    if log_interval <= 0:
                        log.debug("Bytes downloaded for %s: %d", url, bytes_downloaded)
                        log_interval = chunk_size * 1024
            end = time.time()
            log.info(
                "Downloaded %s, %s bytes in %s seconds: sha256:%s",
                url,
                bytes_downloaded,
                int(end - start),
                get_hash(dest, hash_alg="sha256"),
            )
            if mode:
                log.info("chmod %o %s", mode, dest)
                os.chmod(dest, mode)


async def download_buildsystem_bits(partials_config, downloads, tools_dir):
    """Download external tools needed to make partials."""

    # We're making the assumption that the "to" mar is the same for all,
    # as that's the way this task is currently used.
    to_url = extract_download_urls(partials_config, mar_type="to").pop()

    repo = get_option(
        downloads[to_url]["extracted_path"],
        filename="platform.ini",
        section="Build",
        option="SourceRepository",
    )
    revision = get_option(
        downloads[to_url]["extracted_path"],
        filename="platform.ini",
        section="Build",
        option="SourceStamp",
    )

    urls = {
        "make_incremental_update.sh": f"{repo}/raw-file/{revision}/tools/"
        "update-packaging/make_incremental_update.sh",
        "common.sh": f"{repo}/raw-file/{revision}/tools/update-packaging/common.sh",
        "mar": "https://archive.mozilla.org/pub/mozilla.org/firefox/nightly/"
        "latest-mozilla-central/mar-tools/linux64/mar",
        "mbsdiff": "https://archive.mozilla.org/pub/mozilla.org/firefox/nightly/"
        "latest-mozilla-central/mar-tools/linux64/mbsdiff",
    }
    for filename, url in urls.items():
        filename = tools_dir / filename
        await retry_download(url, dest=filename, mode=0o755)


def find_file(directory, filename):
    """Return the first path matching *filename* under *directory* (recursive).

    Raises StopIteration if no match exists.
    """
    log.debug("Searching for %s in %s", filename, directory)
    return next(Path(directory).rglob(filename))


def get_option(directory, filename, section, option):
    """Read [*section*] *option* from the first *filename* found in *directory*."""
    log.info("Extracting [%s]: %s from %s/**/%s", section, option, directory, filename)
    f = find_file(directory, filename)
    config = configparser.ConfigParser()
    config.read(f)
    rv = config.get(section, option)
    log.info("Found %s", rv)
    return rv


def extract_download_urls(partials_config, mar_type):
    """Extract a set of urls to download from the task configuration.

    mar_type should be one of "from", "to"
    """
    return {definition[f"{mar_type}_mar"] for definition in partials_config}


async def download_and_verify_mars(partials_config, allowed_url_prefixes, signing_cert):
    """Download, check signature, channel ID and unpack MAR files.

    Returns a dict mapping each URL to its "download_path" and
    "extracted_path" (both in fresh temporary directories).
    """
    # Separate these categories so we can opt to perform checks on only 'to' downloads.
    from_urls = extract_download_urls(partials_config, mar_type="from")
    to_urls = extract_download_urls(partials_config, mar_type="to")
    tasks = list()
    downloads = dict()

    semaphore = asyncio.Semaphore(2)  # Magic 2 to reduce network timeout errors.
    for url in from_urls.union(to_urls):
        verify_allowed_url(url, allowed_url_prefixes)
        downloads[url] = {
            "download_path": Path(tempfile.mkdtemp()) / Path(url).name,
        }
        tasks.append(
            retry_download(url, downloads[url]["download_path"], semaphore=semaphore)
        )

    await asyncio.gather(*tasks)

    for url in downloads:
        # Verify signature, but not from an artifact as we don't
        # depend on the signing task
        if not os.getenv("MOZ_DISABLE_MAR_CERT_VERIFICATION") and not url.startswith(
            QUEUE_PREFIX
        ):
            verify_signature(downloads[url]["download_path"], signing_cert)

        # Only validate the target channel ID, as we update from beta->release
        if url in to_urls:
            validate_mar_channel_id(
                downloads[url]["download_path"], os.environ["MAR_CHANNEL_ID"]
            )

        downloads[url]["extracted_path"] = tempfile.mkdtemp()
        with open(downloads[url]["download_path"], "rb") as mar_fh:
            log.info(
                "Unpacking %s into %s",
                downloads[url]["download_path"],
                downloads[url]["extracted_path"],
            )
            m = MarReader(mar_fh)
            m.extract(downloads[url]["extracted_path"])

    return downloads


async def run_command(cmd, cwd="/", env=None, label=None, silent=False):
    """Run *cmd* in a shell, streaming its output to the log.

    If *label* is given it prefixes each logged line; if *silent* is True
    the output is discarded instead of logged.
    """
    log.info("Running: %s", cmd)
    if not env:
        env = dict()
    process = await asyncio.create_subprocess_shell(
        cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        cwd=cwd,
        env=env,
    )
    if label:
        label = f"{label}: "
    else:
        label = ""

    async def read_output(stream, label, printcmd):
        while True:
            line = await stream.readline()
            if line == b"":
                break
            printcmd("%s%s", label, line.decode("utf-8").rstrip())

    if silent:
        # Still drain stdout/stderr even though we discard them: the child
        # can otherwise block forever once a pipe buffer fills up.
        await asyncio.gather(
            process.stdout.read(), process.stderr.read(), process.wait()
        )
    else:
        await asyncio.gather(
            read_output(process.stdout, label, log.info),
            read_output(process.stderr, label, log.warning),
        )
        await process.wait()


async def generate_partial(from_dir, to_dir, dest_mar, mar_data, tools_dir, arch):
    """Invoke make_incremental_update.sh to build *dest_mar*, then validate it."""
    log.info("Generating partial %s", dest_mar)
    env = os.environ.copy()
    env["LC_ALL"] = "C"
    # Stringify explicitly: env values must be plain strings, not Paths.
    env["MAR"] = str(tools_dir / "mar")
    env["MBSDIFF"] = str(tools_dir / "mbsdiff")
    if arch:
        env["BCJ_OPTIONS"] = " ".join(BCJ_OPTIONS[arch])
    env["MOZ_PRODUCT_VERSION"] = mar_data["version"]
    env["MAR_CHANNEL_ID"] = mar_data["MAR_CHANNEL_ID"]
    env["BRANCH"] = mar_data["branch"]

    make_incremental_update = tools_dir / "make_incremental_update.sh"
    cmd = f"{make_incremental_update} {dest_mar} {from_dir} {to_dir}"

    await run_command(cmd, cwd=dest_mar.parent, env=env, label=dest_mar.name)
    validate_mar_channel_id(dest_mar, mar_data["MAR_CHANNEL_ID"])


async def manage_partial(
    partial_def, artifacts_dir, tools_dir, downloads, semaphore, arch=None
):
    """Build one partial MAR described by *partial_def*; return its manifest entry."""
    from_url = partial_def["from_mar"]
    to_url = partial_def["to_mar"]
    from_path = downloads[from_url]["extracted_path"]
    to_path = downloads[to_url]["extracted_path"]

    mar_data = {
        "MAR_CHANNEL_ID": os.environ["MAR_CHANNEL_ID"],
        "version": get_option(
            to_path, filename="application.ini", section="App", option="Version"
        ),
        "appName": get_option(
            from_path, filename="application.ini", section="App", option="Name"
        ),
        # Use Gecko repo and rev from platform.ini, not application.ini
        "repo": get_option(
            to_path, filename="platform.ini", section="Build", option="SourceRepository"
        ),
        "revision": get_option(
            to_path, filename="platform.ini", section="Build", option="SourceStamp"
        ),
        "locale": partial_def["locale"],
        "from_mar": partial_def["from_mar"],
        "from_size": os.path.getsize(downloads[from_url]["download_path"]),
        "from_hash": get_hash(downloads[from_url]["download_path"], hash_alg="sha512"),
        "from_buildid": get_option(
            from_path, filename="application.ini", section="App", option="BuildID"
        ),
        "to_mar": partial_def["to_mar"],
        "to_size": os.path.getsize(downloads[to_url]["download_path"]),
        "to_hash": get_hash(downloads[to_url]["download_path"], hash_alg="sha512"),
        "to_buildid": get_option(
            to_path, filename="application.ini", section="App", option="BuildID"
        ),
        "mar": partial_def["dest_mar"],
    }
    # if branch not set explicitly use repo-name
    mar_data["branch"] = partial_def.get("branch", Path(mar_data["repo"]).name)

    # Optional balrog-style metadata, copied through only when present.
    for field in (
        "update_number",
        "previousVersion",
        "previousBuildNumber",
        "toVersion",
        "toBuildNumber",
    ):
        if field in partial_def:
            mar_data[field] = partial_def[field]

    dest_mar = Path(artifacts_dir) / mar_data["mar"]

    # Bound concurrent generation (one per available core, see async_main).
    async with semaphore:
        await generate_partial(from_path, to_path, dest_mar, mar_data, tools_dir, arch)

    mar_data["size"] = os.path.getsize(dest_mar)
    mar_data["hash"] = get_hash(dest_mar, hash_alg="sha512")
    return mar_data


async def async_main(args, signing_cert):
    """Download inputs, generate every partial, clean up, and return the manifest."""
    tasks = []

    allowed_url_prefixes = list(ALLOWED_URL_PREFIXES)
    if args.allow_staging_prefixes:
        allowed_url_prefixes += STAGING_URL_PREFIXES

    task = json.load(args.task_definition)

    downloads = await download_and_verify_mars(
        task["extra"]["funsize"]["partials"], allowed_url_prefixes, signing_cert
    )

    tools_dir = Path(tempfile.mkdtemp())
    await download_buildsystem_bits(
        partials_config=task["extra"]["funsize"]["partials"],
        downloads=downloads,
        tools_dir=tools_dir,
    )

    # May want to consider os.cpu_count() if we ever run on osx/win.
    # sched_getaffinity is the list of cores we can run on, not the total.
    semaphore = asyncio.Semaphore(len(os.sched_getaffinity(0)))
    for definition in task["extra"]["funsize"]["partials"]:
        tasks.append(
            asyncio.ensure_future(
                retry_async(
                    manage_partial,
                    retry_exceptions=(aiohttp.ClientError, asyncio.TimeoutError),
                    kwargs=dict(
                        partial_def=definition,
                        artifacts_dir=args.artifacts_dir,
                        tools_dir=tools_dir,
                        arch=args.arch,
                        downloads=downloads,
                        semaphore=semaphore,
                    ),
                )
            )
        )
    manifest = await asyncio.gather(*tasks)

    # Remove the temporary downloads, unpacked trees, and tools.
    for url in downloads:
        downloads[url]["download_path"].unlink()
        shutil.rmtree(downloads[url]["extracted_path"])
    shutil.rmtree(tools_dir)

    return manifest


def main():
    args = process_arguments()

    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
    log.setLevel(args.log_level)

    signing_cert = args.signing_cert.read()
    # Raise rather than assert: asserts are stripped under `python -O`.
    if get_keysize(signing_cert) != 4096:
        raise ValueError("Signing certificate must have a 4096-bit key")

    artifacts_dir = Path(args.artifacts_dir)
    # exist_ok avoids a check-then-create race.
    artifacts_dir.mkdir(exist_ok=True)

    # asyncio.run replaces the deprecated get_event_loop/run_until_complete/close.
    manifest = asyncio.run(async_main(args, signing_cert))

    manifest_file = artifacts_dir / "manifest.json"
    with open(manifest_file, "w") as fp:
        json.dump(manifest, fp, indent=2, sort_keys=True)

    # Lazy %-formatting so the dump is only built when DEBUG is enabled.
    log.debug("%s", json.dumps(manifest, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()