tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

browsertime_benchmark.py (23547B)


      1 # This Source Code Form is subject to the terms of the Mozilla Public
      2 # License, v. 2.0. If a copy of the MPL was not distributed with this
      3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      4 import json
      5 import pathlib
      6 import sys
      7 from collections.abc import Iterable
      8 
      9 import filters
     10 
     11 sys.path.insert(0, str(pathlib.Path(__file__).parent))
     12 from browsertime_pageload import PageloadSupport
     13 from logger.logger import RaptorLogger
     14 
# Module-level logger shared by the support classes below.
LOG = RaptorLogger(component="perftest-support-class")

# Aggregate metric names skipped by parseUnknown(); these are summaries
# rather than per-iteration replicates, so they are not re-collected.
METRIC_BLOCKLIST = [
    "mean",
    "median",
    "geomean",
]
     22 
     23 
     24 class MissingBenchmarkResultsError(Exception):
     25    """
     26    This error is raised when the benchmark results from a test
     27    run do not contain the `browsertime_benchmark` entry in the dict
     28    of extra data.
     29    """
     30 
     31    pass
     32 
     33 
class BenchmarkSupport(PageloadSupport):
    """Support class for benchmark-style browsertime tests.

    Extends PageloadSupport with parsing for youtube-playback, webcodecs,
    and generic ("unknown") benchmark measurements, plus per-benchmark
    score summarization in construct_summary().
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Human-readable failure descriptions collected while parsing results
        # (populated by parseYoutubePlaybackPerformanceOutput, read by
        # report_test_success).
        self.failed_tests = []
        # True once any youtube-playback sub-test has failed.
        self.youtube_playback_failure = False
     39 
     40    def setup_test(self, next_test, args):
     41        super().setup_test(next_test, args)
     42        if next_test.get("custom_data", False) == "true":
     43            raise ValueError(
     44                "Cannot use BenchmarkSupport class for custom data, a "
     45                "new support class should be built for that use case."
     46            )
     47 
     48    def modify_command(self, cmd, test):
     49        # Enable cpuTime, and wallclock-tracking metrics
     50        cmd.extend([
     51            "--browsertime.cpuTime_test",
     52            "true",
     53            "--browsertime.wallclock_tracking_test",
     54            "true",
     55        ])
     56 
     57    def handle_result(self, bt_result, raw_result, **kwargs):
     58        """Parse a result for the required results.
     59 
     60        See base_python_support.py for what's expected from this method.
     61        """
     62        # Each entry here is a separate cold pageload iteration (or browser cycle)
     63        for custom_types in raw_result["extras"]:
     64            browsertime_benchmark_results = custom_types.get("browsertime_benchmark")
     65            if not browsertime_benchmark_results:
     66                raise MissingBenchmarkResultsError(
     67                    "Could not find `browsertime_benchmark` entry "
     68                    "in the browsertime `extra` results"
     69                )
     70            for metric, values in browsertime_benchmark_results.items():
     71                bt_result["measurements"].setdefault(metric, []).append(values)
     72 
     73        if self.perfstats:
     74            for cycle in raw_result["geckoPerfStats"]:
     75                for metric in cycle:
     76                    bt_result["measurements"].setdefault(
     77                        "perfstat-" + metric, []
     78                    ).append(cycle[metric])
     79 
    def parseYoutubePlaybackPerformanceOutput(self, test):
        """Parse the metrics for the Youtube playback performance test.

        For each video measured values for dropped and decoded frames will be
        available from the benchmark site.

        {u'PlaybackPerf.VP9.2160p60@2X': {u'droppedFrames': 1, u'decodedFrames': 796}

        With each page cycle / iteration of the test multiple values can be present.

        Raptor will calculate the percentage of dropped frames to decoded frames.
        All those three values will then be emitted as separate sub tests.

        Returns a (subtests, vals) tuple: `subtests` is the list of subtest
        dicts, `vals` pairs each summarized dropped-frames value with its name.
        """
        _subtests = {}
        # Find the youtube-playback measurement key; only the first match is used.
        test_name = [
            measurement
            for measurement in test["measurements"].keys()
            if "youtube-playback" in measurement
        ]
        if len(test_name) > 0:
            data = test["measurements"].get(test_name[0])
        else:
            raise Exception("No measurements found for youtube test!")

        def create_subtest_entry(
            name,
            value,
            unit=test["subtest_unit"],
            lower_is_better=test["subtest_lower_is_better"],
        ):
            # build a list of subtests and append all related replicates
            if name not in _subtests:
                # subtest not added yet, first pagecycle, so add new one
                _subtests[name] = {
                    "name": name,
                    "unit": unit,
                    "lowerIsBetter": lower_is_better,
                    "replicates": [],
                }

            _subtests[name]["replicates"].append(value)
            if self.subtest_alert_on is not None:
                if name in self.subtest_alert_on:
                    LOG.info(
                        "turning on subtest alerting for measurement type: %s" % name
                    )
                    _subtests[name]["shouldAlert"] = True

        for pagecycle in data:
            # Each pagecycle is a one-element list wrapping the per-video dict.
            for _sub, _value in pagecycle[0].items():
                # A video that decoded nothing failed completely; record it.
                if _value["decodedFrames"] == 0:
                    self.failed_tests.append(
                        "%s test Failed. decodedFrames %s droppedFrames %s."
                        % (_sub, _value["decodedFrames"], _value["droppedFrames"])
                    )

                try:
                    percent_dropped = (
                        float(_value["droppedFrames"]) / _value["decodedFrames"] * 100.0
                    )
                except ZeroDivisionError:
                    # if no frames have been decoded the playback failed completely
                    percent_dropped = 100.0

                # Remove the not needed "PlaybackPerf." prefix from each test
                _sub = _sub.split("PlaybackPerf", 1)[-1]
                if _sub.startswith("."):
                    _sub = _sub[1:]

                # build a list of subtests and append all related replicates
                create_subtest_entry(
                    f"{_sub}_decoded_frames",
                    _value["decodedFrames"],
                    lower_is_better=False,
                )
                create_subtest_entry(f"{_sub}_dropped_frames", _value["droppedFrames"])
                create_subtest_entry(f"{_sub}_%_dropped_frames", percent_dropped)

        # Flag the run as a youtube playback failure if any sub-test failed;
        # report_test_success() uses this flag later.
        if len(self.failed_tests) > 0:
            self.youtube_playback_failure = True
        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            # Summarize each subtest with the median of its replicates.
            # pylint: disable=W1633
            _subtests[name]["value"] = round(
                float(filters.median(_subtests[name]["replicates"])), 2
            )
            subtests.append(_subtests[name])
            # only include dropped_frames values, without the %_dropped_frames values
            if name.endswith("X_dropped_frames"):
                vals.append([_subtests[name]["value"], name])

        return subtests, vals
    176 
    177    def parseWebCodecsOutput(self, test):
    178        """
    179        Example output (this is one page cycle):
    180 
    181        {
    182            'name': 'webcodecs',
    183            'type': 'benchmark',
    184            'measurements': {
    185            'webcodecs': [
    186                ['{
    187                    "vp8 realtime encode": {
    188                        "frame-to-frame mean (key)": {"value":5.222857,"unit":"ms"},
    189                        "frame-to-frame cv (key)":{"value":27.052957,"unit":"%"},
    190                        "frame-dropping rate (key)":{"value":0,"unit":"%"},
    191                        "frame-to-frame mean (non key)":{"value":1.460678,"unit":"ms"},
    192                        "frame-to-frame cv (non key)":{"value":65.4360136,"unit":"%"},
    193                        "frame-dropping rate (non key)":{"value":0,"unit":"%"}
    194                    }
    195                }'],
    196                ...
    197            ]
    198            },
    199            'lower_is_better': False,
    200            'unit': 'score'
    201        }
    202        """
    203 
    204        data = test["measurements"]["webcodecs"]
    205        results = {}
    206        for page_cycle in data:
    207            d = json.loads(page_cycle[0])
    208            for test_name, test_data in d.items():
    209                results.setdefault(test_name, []).append(test_data)
    210 
    211        _subtests = {}
    212        for test_name in results:
    213            for result in results[test_name]:
    214                for subtest_name, subtest_result in result.items():
    215                    subtest_result_name = f"{test_name} - {subtest_name}"
    216                    _subtests.setdefault(
    217                        subtest_result_name,
    218                        {
    219                            "unit": subtest_result["unit"],
    220                            "alertThreshold": float(test["alert_threshold"]),
    221                            "lowerIsBetter": test["subtest_lower_is_better"],
    222                            "name": subtest_result_name,
    223                            "replicates": [],
    224                            "shouldAlert": True,
    225                        },
    226                    )["replicates"].append(subtest_result["value"])
    227 
    228            for subtest_name in results[test_name]:
    229                for subtest_name in result:
    230                    subtest_result_name = f"{test_name} - {subtest_name}"
    231                    _subtests[subtest_result_name]["value"] = filters.median(
    232                        _subtests[subtest_result_name]["replicates"]
    233                    )
    234 
    235        subtests = sorted(_subtests.values(), key=lambda x: x["name"], reverse=True)
    236        for subtest in subtests:
    237            if isinstance(subtest["value"], float):
    238                subtest["value"] = round(subtest["value"], 3)
    239        vals = [[subtest["value"], subtest["name"]] for subtest in subtests]
    240        return subtests, vals
    241 
    def parseUnknown(self, test):
        """Parse an arbitrary benchmark's measurements into subtests.

        Attempts to flatten whatever we've been given: dictionary keys name
        the metrics, and the arrays under the single top-level measurement
        key represent iterations. Metrics in METRIC_BLOCKLIST are skipped.

        Returns a (subtests, vals) tuple of subtest dicts and [value, name]
        pairs, ordered by name (descending).
        """
        _subtests = {}

        if not isinstance(test["measurements"], dict):
            raise Exception(
                "Expected a dictionary with a single entry as the name of the test. "
                "The value of this key should be the data."
            )

        # The measurements dict carries a single entry named after the test;
        # its value is the list of per-iteration metric dicts.
        measurements = test["measurements"]
        iterations = measurements[list(measurements.keys())[0]]
        for iteration in iterations:
            # NOTE: a `flattened_metrics` variable used to be consulted here,
            # but it was always None, so it has been removed as dead code.
            for metric, value in iteration.items():
                if metric in METRIC_BLOCKLIST:
                    # TODO: Add an option in the test manifest for this
                    continue
                if metric not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[metric] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": metric,
                        "replicates": [],
                    }
                # Scalars are wrapped so everything extends the replicate list.
                updated_metric = value if isinstance(value, Iterable) else [value]
                # pylint: disable=W1633
                _subtests[metric]["replicates"].extend(
                    round(x, 3) for x in updated_metric
                )

        vals = []
        subtests = []
        names = sorted(_subtests, reverse=True)
        summaries = {
            "median": filters.median,
            "mean": filters.mean,
            "geomean": filters.geometric_mean,
        }
        # The summary method is per-test, so hoist the lookup out of the loop.
        summary_method = test.get("submetric_summary_method", "median")
        summarize = summaries[summary_method]
        for name in names:
            _subtests[name]["value"] = round(
                summarize(_subtests[name]["replicates"]), 3
            )
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals
    296 
    def construct_summary(self, vals, testname, unit=None):
        """Compute the suite-level score for a benchmark from its subtest values.

        :param vals: list of [value, subtest_name] pairs.
        :param testname: benchmark name; selects the per-benchmark formula.
        :param unit: measurement unit, only consulted for supporting_data.
        """
        def _filter(vals, value=None):
            # Return all values, or only those whose subtest name == `value`.
            if value is None:
                return [i for i, j in vals]
            return [i for i, j in vals if j == value]

        if testname.startswith("raptor-v8_7"):
            return 100 * filters.geometric_mean(_filter(vals))

        if testname == "speedometer3":
            # Speedometer 3 self-reports a single "score" subtest.
            score = None
            for val, name in vals:
                if name == "score":
                    score = val
            if score is None:
                raise Exception("Unable to find score for Speedometer 3")
            return score

        if "speedometer" in testname:
            correctionFactor = 3
            results = _filter(vals)
            # speedometer has 16 tests, each of these are made of up 9 subtests
            # and a sum of the 9 values.  We receive 160 values, and want to use
            # the 16 test values, not the sub test values.
            if len(results) != 160:
                raise Exception(
                    "Speedometer has 160 subtests, found: %s instead" % len(results)
                )

            results = results[9::10]
            # pylint --py3k W1619
            score = 60 * 1000 / filters.geometric_mean(results) / correctionFactor
            return score

        if "stylebench" in testname:
            # see https://bug-172968-attachments.webkit.org/attachment.cgi?id=319888
            correctionFactor = 3
            results = _filter(vals)

            # stylebench has 6 tests. Five of them are made of up 5 subtests
            #
            #   * Adding classes.
            #   * Removing classes.
            #   * Mutating attributes.
            #   * Adding leaf elements.
            #   * Removing leaf elements.
            #
            # which are made of two subtests each (sync/async) and repeated 5 times
            # each, thus, the list here looks like:
            #
            #   [Test name/Adding classes - 0/ Sync; <x>]
            #   [Test name/Adding classes - 0/ Async; <y>]
            #   [Test name/Adding classes - 0; <x> + <y>]
            #   [Test name/Removing classes - 0/ Sync; <x>]
            #   [Test name/Removing classes - 0/ Async; <y>]
            #   [Test name/Removing classes - 0; <x> + <y>]
            #   ...
            #   [Test name/Adding classes - 1 / Sync; <x>]
            #   [Test name/Adding classes - 1 / Async; <y>]
            #   [Test name/Adding classes - 1 ; <x> + <y>]
            #   ...
            #   [Test name/Removing leaf elements - 4; <x> + <y>]
            #   [Test name; <sum>] <- This is what we want.
            #
            # So, 5 (subtests) *
            #     5 (repetitions) *
            #     3 (entries per repetition (sync/async/sum)) =
            #     75 entries for test before the sum.
            #
            # We receive 76 entries per test, which ads up to 380. We want to use
            # the 5 test entries, not the rest.
            #
            # Then there's the sixth "Dynamic media queries" test, which gives
            # results for viewports in increments of 50px like:
            #
            #   Dynamic media queries/Resizing to 300px - 0/Sync
            #   Dynamic media queries/Resizing to 300px - 0/Async
            #   Dynamic media queries/Resizing to 300px - 0
            #   Dynamic media queries/Resizing to 350px - 0/Sync
            #   Dynamic media queries/Resizing to 350px - 0/Async
            #   Dynamic media queries/Resizing to 350px - 0
            #   ...
            #   Dynamic media queries/Resizing to 800px - 0/Sync
            #   Dynamic media queries/Resizing to 800px - 0/Async
            #   Dynamic media queries/Resizing to 800px - 0
            #   Dynamic media queries/Resizing to 350px - 1/Sync
            #   Dynamic media queries/Resizing to 350px - 1/Async
            #   Dynamic media queries/Resizing to 350px - 1
            #   Dynamic media queries/Resizing to 400px - 1/Sync
            #   Dynamic media queries/Resizing to 400px - 1/Async
            #   Dynamic media queries/Resizing to 400px - 1
            #   ...
            #   Dynamic media queries/Resizing to 800px - 4/Sync
            #   Dynamic media queries/Resizing to 800px - 4/Async
            #   Dynamic media queries/Resizing to 800px - 4
            #   Dynamic media queries <- What we want
            #
            # So len([300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800]) is 11.
            #
            # So, 11 (subtests) *
            #     5 (repetitions) *
            #     3 (entries per repetition (sync/async/sum)) =
            #     165 entries for test before the sum.
            EXPECTED_ENTRIES = 380 + 166
            if len(results) != EXPECTED_ENTRIES:
                raise Exception(
                    f"StyleBench requires {EXPECTED_ENTRIES} entries, found: {len(results)} instead"
                )
            results = results[:380][75::76] + [results[-1]]
            # pylint --py3k W1619
            return 60 * 1000 / filters.geometric_mean(results) / correctionFactor

        if testname.startswith("raptor-kraken") or "sunspider" in testname:
            return sum(_filter(vals))

        if "unity-webgl" in testname or "webaudio" in testname:
            # webaudio_score and unity_webgl_score: self reported as 'Geometric Mean'
            return filters.mean(_filter(vals, "Geometric Mean"))

        if "assorted-dom" in testname:
            # pylint: disable=W1633
            return round(filters.geometric_mean(_filter(vals)), 2)

        if "wasm-misc" in testname:
            # wasm_misc_score: self reported as '__total__'
            return filters.mean(_filter(vals, "__total__"))

        if "wasm-godot" in testname:
            # wasm_godot_score: first-interactive mean
            return filters.mean(_filter(vals, "first-interactive"))

        if "youtube-playback" in testname:
            # pylint: disable=W1633
            return round(filters.mean(_filter(vals)), 2)

        if "twitch-animation" in testname:
            return round(filters.geometric_mean(_filter(vals, "run")), 2)

        # NOTE(review): substring checks — any testname containing "ve" (then
        # "rt"/"q") lands here; confirm no other benchmark name can collide.
        if "ve" in testname:
            if "rt" in testname:
                # We collect the mean and cv of frame-to-frame performance and the
                # frame-dropping rate for both keyframe and non-keyframe. However,
                # the most important factor is the frame-to-frame mean, so we only
                # include it in the summarized score. Note that all the values
                # collected are monitored by "shouldAlert".
                means = [i for i, j in vals if "mean" in j]
                if len(means) > 0:
                    return round(filters.geometric_mean(means), 2)
                return -1

            if "q" in testname:
                if len(vals) > 0:
                    return round(filters.mean(_filter(vals)), 2)
                return -1

            raise NotImplementedError("Summary for %s is not implemented" % testname)

        if testname.startswith("supporting_data"):
            if not unit:
                return sum(_filter(vals))

            if unit == "%":
                return filters.mean(_filter(vals))

            if unit in ("W", "MHz"):
                # For power in Watts and clock frequencies,
                # summarize with the sum of the averages
                allavgs = []
                for val, subtest in vals:
                    if "avg" in subtest:
                        allavgs.append(val)
                if allavgs:
                    return sum(allavgs)

                raise Exception(
                    "No average measurements found for supporting data with W, or MHz unit ."
                )

            if unit in ["KB", "mAh", "mWh"]:
                return sum(_filter(vals))

            raise NotImplementedError("Unit %s not suported" % unit)

        # Default summary: geometric mean of all values (mean for a single value).
        if len(vals) > 1:
            # pylint: disable=W1633
            return round(filters.geometric_mean(_filter(vals)), 2)

        # pylint: disable=W1633
        return round(filters.mean(_filter(vals)), 2)
    486 
    def _process_measurements(self, suite, test, measurement_name, replicates):
        """Build a perfherder subtest dict for one additional measurement type.

        Units are overridden for cpuTime (ms) and powerUsage (uWh); alerting
        is enabled/disabled based on `subtest_alert_on`, the browser app, and
        the power-test flag.
        """
        # cpuTime and powerUsage carry fixed units regardless of the test's
        # configured subtest unit.
        unit = test["subtest_unit"]
        if measurement_name == "cpuTime":
            unit = "ms"
        elif measurement_name == "powerUsage":
            unit = "uWh"

        subtest = {
            "name": measurement_name,
            "lowerIsBetter": test["subtest_lower_is_better"],
            "alertThreshold": float(test["alert_threshold"]),
            "unit": unit,
        }

        # Add the alert window settings if needed here too in case
        # there is no summary value in the test
        for schema_name in ("minBackWindow", "maxBackWindow", "foreWindow"):
            window = suite.get(schema_name, None)
            if window is not None:
                subtest[schema_name] = window

        # if 'alert_on' is set for this particular measurement, then we want to set
        # the flag in the perfherder output to turn on alerting for this subtest
        if self.subtest_alert_on is not None:
            if measurement_name in self.subtest_alert_on:
                LOG.info(
                    "turning on subtest alerting for measurement type: %s"
                    % measurement_name
                )
                subtest["shouldAlert"] = True
                # Chromium-based apps never alert on these subtests.
                if self.app in (
                    "chrome",
                    "chrome-m",
                    "custom-car",
                    "cstm-car-m",
                ):
                    subtest["shouldAlert"] = False
            else:
                # Explicitly set `shouldAlert` to False so that the measurement
                # is not alerted on. Otherwise Perfherder defaults to alerting.
                LOG.info(
                    "turning off subtest alerting for measurement type: %s"
                    % measurement_name
                )
                subtest["shouldAlert"] = False

        # Power tests always alert on powerUsage, overriding the above.
        if self.power_test and measurement_name == "powerUsage":
            subtest["shouldAlert"] = True

        subtest["replicates"] = replicates
        return subtest
    540 
    def summarize_test(self, test, suite, **kwargs):
        """Parse a benchmark test's measurements into subtests and summarize them."""
        name = test["name"]
        if "youtube-playback" in name:
            subtests, vals = self.parseYoutubePlaybackPerformanceOutput(test)
        elif "ve" in name:
            subtests, vals = self.parseWebCodecsOutput(test)
        else:
            # Attempt to parse the unknown benchmark by flattening the
            # given data and merging all the arrays of non-iterable
            # data that fall under the same key.
            # XXX Note that this is not fully implemented for the summary
            # of the metric or test as we don't have a use case for that yet.
            subtests, vals = self.parseUnknown(test)

        # Defensive: the parsers above always return subtests, but keep the
        # guard so a future parser returning None fails loudly.
        if subtests is None:
            raise Exception("No benchmark metrics found in browsertime results")

        suite["subtests"] = subtests

        self.add_additional_metrics(test, suite)

        # summarize results for both benchmark type tests
        if len(subtests) > 1:
            suite["value"] = self.construct_summary(vals, testname=name)
        subtests.sort(key=lambda subtest: subtest["name"])
    566 
    def summarize_suites(self, suites):
        """No suite-level summarization is needed for benchmark tests."""
        pass
    569 
    def report_test_success(self):
        """Return True when no sub-tests failed; otherwise log details and return False."""
        if not self.failed_tests:
            return True

        LOG.warning("Some tests failed.")
        if self.youtube_playback_failure:
            # Log every failed youtube sub-test before the overall warning.
            for failure in self.failed_tests:
                LOG.warning("Youtube sub-test FAILED: %s" % failure)
            LOG.warning(
                "Youtube playback sub-tests failed!!! "
                "Not submitting results to perfherder!"
            )
        return False