fetch-talos-pdfs.py (4126B)
#!/usr/bin/env python3

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
This script gathers all the required PDFs listed in the test_manifest.json
file found in the mozilla pdf.js repo. PDFs that are checked into the repo
are copied, and PDFs that are only referenced through a `.link` file are
downloaded.
"""

import json
import os
import pathlib
import shutil

import requests
from redo import retriable

# Seconds to wait for the server to connect/send data before giving up on an
# attempt. Without a timeout, a stalled connection would block forever and the
# retry logic on `fetch_file` would never get a chance to run.
REQUEST_TIMEOUT_SECONDS = 60


def log(msg):
    """Print a message prefixed with the name of this script.

    :param str msg: The message to print.
    """
    print(f"fetch-talos-pdf: {msg}")


@retriable(attempts=7, sleeptime=5, sleepscale=2)
def fetch_file(url, filepath):
    """Download a file from the given url to a given file.

    The content is streamed into a temporary `.part` file next to the
    destination, and only moved into place once the download has completed.
    This way a failed download never leaves a truncated file at `filepath`.

    :param str url: URL to download file from.
    :param Path filepath: Location to output the downloaded file
        (includes the name of the file).
    :raises requests.exceptions.RequestException: If the request fails, times
        out, or returns an HTTP error status (after all retries are used).
    """
    chunk_size = 4096
    partial_path = filepath.with_name(filepath.name + ".part")

    try:
        # Using the response as a context manager guarantees that the
        # streamed connection is released even if the download fails.
        with requests.get(
            url, stream=True, timeout=REQUEST_TIMEOUT_SECONDS
        ) as response:
            response.raise_for_status()

            with partial_path.open("wb") as fd:
                for chunk in response.iter_content(chunk_size):
                    fd.write(chunk)

        partial_path.replace(filepath)
    finally:
        # Only still present if the download failed part-way through.
        if partial_path.exists():
            partial_path.unlink()


def fetch_talos_pdf_link(pdf_path, output_file):
    """Fetches a PDF file with a link into the output file location.

    :param Path pdf_path: Path to a file that contains a URL to download
        the PDF from.
    :param Path output_file: Location (including the file name) to download PDF to.
    """
    pdf_link = pdf_path.read_text(encoding="utf-8").strip()
    log(f"Downloading from PDF link: {pdf_link}")
    fetch_file(pdf_link, output_file)


def gather_talos_pdf(test_folder, pdf_info, output_dir):
    """Gathers a PDF file into the output directory.

    Entries whose "talos" field is set to a false value are skipped. If the
    PDF already exists in the output directory, nothing is done. Otherwise,
    entries with a truthy "link" field are downloaded using the URL found in
    the `<file>.link` file, and all other entries are copied from the test
    folder.

    :param Path test_folder: The test folder that the pdfs can be found in.
    :param dict pdf_info: Information about the pdf we're currently gathering, and
        found in the test/test_manifest.json file from the pdf.js repo.
    :param Path output_dir: The directory to move/download the PDF to.
    """
    if not pdf_info.get("talos", True):
        return

    pdf_file = pdf_info["file"]
    output_pdf_path = pathlib.Path(output_dir, pathlib.Path(pdf_file).name)

    log(f"Gathering PDF {pdf_file}...")
    if output_pdf_path.exists():
        log(f"{pdf_file} already exists in output location")
    elif pdf_info.get("link", False):
        fetch_talos_pdf_link(
            pathlib.Path(test_folder, pdf_file + ".link"), output_pdf_path
        )
    else:
        log(f"Copying PDF to output location {output_pdf_path}")
        shutil.copy(pathlib.Path(test_folder, pdf_file), output_pdf_path)


def gather_talos_pdfs(pdf_js_repo, output_dir):
    """Gather all pdfs to be used in the talos pdfpaint test.

    Uses the pdf.js repo to gather the files from its test/test_manifest.json
    file. Some of these are also links that need to be downloaded. These
    are output in an output directory.

    :param Path pdf_js_repo: Path to the Mozilla Github pdf.js repo.
    :param Path output_dir: Output directory for the PDFs.
    """
    test_manifest_path = pathlib.Path(
        pdf_js_repo, "test", "test_manifest.json"
    ).resolve()
    test_folder = test_manifest_path.parent

    # Gather all the PDFs into the output directory. JSON is UTF-8 encoded,
    # so don't rely on the locale's default encoding to read it.
    test_manifest = json.loads(test_manifest_path.read_text(encoding="utf-8"))
    for pdf_info in test_manifest:
        gather_talos_pdf(test_folder, pdf_info, output_dir)

    # Include the test manifest in the output directory as it
    # contains the names of the tests
    shutil.copy(test_manifest_path, pathlib.Path(output_dir, test_manifest_path.name))


def main():
    """Gather the PDFs using the locations given through the environment.

    Reads the location of the pdf.js repo from MOZ_FETCHES_DIR, and the
    output location from OUTPUT_DIR (created if it doesn't exist).

    :raises Exception: If either environment variable is unset or empty, or
        if the pdf.js repo can't be found in MOZ_FETCHES_DIR.
    """
    moz_fetches_dir = os.environ.get("MOZ_FETCHES_DIR", "")
    if not moz_fetches_dir:
        raise Exception(
            "MOZ_FETCHES_DIR is not set to the path containing the pdf.js repo"
        )

    pdf_js_repo = pathlib.Path(moz_fetches_dir, "pdf.js")
    if not pdf_js_repo.exists():
        raise Exception("Can't find the pdf.js repository in MOZ_FETCHES_DIR")

    output_dir = os.environ.get("OUTPUT_DIR", "")
    if not output_dir:
        raise Exception("OUTPUT_DIR is not set for the file output")

    output_dir_path = pathlib.Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)
    gather_talos_pdfs(pdf_js_repo, output_dir_path)


if __name__ == "__main__":
    main()