tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

fetch-talos-pdfs.py (4126B)


      1 #!/usr/bin/env python3
      2 
      3 # This Source Code Form is subject to the terms of the Mozilla Public
      4 # License, v. 2.0. If a copy of the MPL was not distributed with this
      5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      6 
      7 """
      8 This script downloads all the required PDFs from the test_manifest.json
      9 file found in the mozilla pdf.js repo.
     10 """
     11 
     12 import json
     13 import os
     14 import pathlib
     15 import shutil
     16 
     17 import requests
     18 from redo import retriable
     19 
     20 
     21 def log(msg):
     22    print("fetch-talos-pdf: %s" % msg)
     23 
     24 
     25 @retriable(attempts=7, sleeptime=5, sleepscale=2)
     26 def fetch_file(url, filepath):
     27    """Download a file from the given url to a given file.
     28 
     29    :param str url: URL to download file from.
     30    :param Path filepath: Location to ouput the downloaded file
     31        (includes the name of the file).
     32    """
     33    size = 4096
     34    r = requests.get(url, stream=True)
     35    r.raise_for_status()
     36 
     37    with filepath.open("wb") as fd:
     38        for chunk in r.iter_content(size):
     39            fd.write(chunk)
     40 
     41 
     42 def fetch_talos_pdf_link(pdf_path, output_file):
     43    """Fetches a PDF file with a link into the output file location.
     44 
     45    :param Path pdf_path: Path to a PDF file that contains a URL to download from.
     46    :param Path output_file: Location (including the file name) to download PDF to.
     47    """
     48    pdf_link = pdf_path.read_text().strip()
     49    log(f"Downloading from PDF link: {pdf_link}")
     50    fetch_file(pdf_link, output_file)
     51 
     52 
     53 def gather_talos_pdf(test_folder, pdf_info, output_dir):
     54    """Gathers a PDF file into the output directory.
     55 
     56    :param Path test_folder: The test folder that the pdfs can be found in.
     57    :param Path pdf_info: Information about the pdf we're currently gathering, and
     58        found in the test/test_manifest.json file from the pdf.js repo.
     59    :param Path output_dir: The directory to move/download the PDF to.
     60    """
     61    if not pdf_info.get("talos", True):
     62        return
     63    pdf_file = pdf_info["file"]
     64    output_pdf_path = pathlib.Path(output_dir, pathlib.Path(pdf_file).name)
     65 
     66    log(f"Gathering PDF {pdf_file}...")
     67    if output_pdf_path.exists():
     68        log(f"{pdf_file} already exists in output location")
     69    elif pdf_info.get("link", False):
     70        fetch_talos_pdf_link(
     71            pathlib.Path(test_folder, pdf_file + ".link"), output_pdf_path
     72        )
     73    else:
     74        log(f"Copying PDF to output location {output_pdf_path}")
     75        shutil.copy(pathlib.Path(test_folder, pdf_file), output_pdf_path)
     76 
     77 
     78 def gather_talos_pdfs(pdf_js_repo, output_dir):
     79    """Gather all pdfs to be used in the talos pdfpaint test.
     80 
     81    Uses the pdf.js repo to gather the files from it's test/test_manifest.json
     82    file. Some of these are also links that need to be downloaded. These
     83    are output in an output directory.
     84 
     85    :param Path pdf_js_repo: Path to the Mozilla Github pdf.js repo.
     86    :param Path output_dir: Output directory for the PDFs.
     87    """
     88    test_manifest_path = pathlib.Path(
     89        pdf_js_repo, "test", "test_manifest.json"
     90    ).resolve()
     91    test_folder = test_manifest_path.parent
     92 
     93    # Gather all the PDFs into the output directory
     94    test_manifest = json.loads(test_manifest_path.read_text())
     95    for pdf_info in test_manifest:
     96        gather_talos_pdf(test_folder, pdf_info, output_dir)
     97 
     98    # Include the test manifest in the output directory as it
     99    # contains the names of the tests
    100    shutil.copy(test_manifest_path, pathlib.Path(output_dir, test_manifest_path.name))
    101 
    102 
    103 if __name__ == "__main__":
    104    moz_fetches_dir = os.environ.get("MOZ_FETCHES_DIR", "")
    105    if not moz_fetches_dir:
    106        raise Exception(
    107            "MOZ_FETCHES_DIR is not set to the path containing the pdf.js repo"
    108        )
    109 
    110    pdf_js_repo = pathlib.Path(moz_fetches_dir, "pdf.js")
    111    if not pdf_js_repo.exists():
    112        raise Exception("Can't find the pdf.js repository in MOZ_FETCHES_DIR")
    113 
    114    output_dir = os.environ.get("OUTPUT_DIR", "")
    115    if not output_dir:
    116        raise Exception("OUTPUT_DIR is not set for the file output")
    117 
    118    output_dir_path = pathlib.Path(output_dir)
    119    output_dir_path.mkdir(parents=True, exist_ok=True)
    120    gather_talos_pdfs(pdf_js_repo, output_dir_path)