browsertime_benchmark.py (23547B)
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import json
import pathlib
import sys
from collections.abc import Iterable

import filters

sys.path.insert(0, str(pathlib.Path(__file__).parent))
from browsertime_pageload import PageloadSupport
from logger.logger import RaptorLogger

LOG = RaptorLogger(component="perftest-support-class")

# Summary metrics excluded when flattening unknown benchmark output in
# parseUnknown (we report raw replicates and summarize ourselves).
METRIC_BLOCKLIST = [
    "mean",
    "median",
    "geomean",
]


class MissingBenchmarkResultsError(Exception):
    """
    This error is raised when the benchmark results from a test
    run do not contain the `browsertime_benchmark` entry in the dict
    of extra data.
    """

    pass


class BenchmarkSupport(PageloadSupport):
    """Support class for parsing/summarizing browsertime benchmark results."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Collects human-readable failure descriptions (currently only
        # produced by the youtube-playback parser).
        self.failed_tests = []
        # Set when any youtube-playback sub-test reported 0 decoded frames.
        self.youtube_playback_failure = False

    def setup_test(self, next_test, args):
        """Validate the test config; custom-data tests are unsupported here.

        :raises ValueError: when the test manifest sets custom_data="true".
        """
        super().setup_test(next_test, args)
        if next_test.get("custom_data", False) == "true":
            raise ValueError(
                "Cannot use BenchmarkSupport class for custom data, a "
                "new support class should be built for that use case."
            )

    def modify_command(self, cmd, test):
        """Append browsertime CLI flags needed by benchmark tests (in place)."""
        # Enable cpuTime, and wallclock-tracking metrics
        cmd.extend([
            "--browsertime.cpuTime_test",
            "true",
            "--browsertime.wallclock_tracking_test",
            "true",
        ])

    def handle_result(self, bt_result, raw_result, **kwargs):
        """Parse a result for the required results.

        Accumulates every metric found in each iteration's
        `browsertime_benchmark` extras entry into
        bt_result["measurements"][metric] (one appended entry per iteration),
        plus optional "perfstat-*" metrics when perfstats are enabled.

        See base_python_support.py for what's expected from this method.

        :raises MissingBenchmarkResultsError: when an extras entry has no
            `browsertime_benchmark` key (or it is empty/falsy).
        """
        # Each entry here is a separate cold pageload iteration (or browser cycle)
        for custom_types in raw_result["extras"]:
            browsertime_benchmark_results = custom_types.get("browsertime_benchmark")
            if not browsertime_benchmark_results:
                raise MissingBenchmarkResultsError(
                    "Could not find `browsertime_benchmark` entry "
                    "in the browsertime `extra` results"
                )
            for metric, values in browsertime_benchmark_results.items():
                bt_result["measurements"].setdefault(metric, []).append(values)

        if self.perfstats:
            for cycle in raw_result["geckoPerfStats"]:
                for metric in cycle:
                    bt_result["measurements"].setdefault(
                        "perfstat-" + metric, []
                    ).append(cycle[metric])

    def parseYoutubePlaybackPerformanceOutput(self, test):
        """Parse the metrics for the Youtube playback performance test.

        For each video measured values for dropped and decoded frames will be
        available from the benchmark site.

        {u'PlaybackPerf.VP9.2160p60@2X': {u'droppedFrames': 1, u'decodedFrames': 796}

        With each page cycle / iteration of the test multiple values can be present.

        Raptor will calculate the percentage of dropped frames to decoded frames.
        All those three values will then be emitted as separate sub tests.

        :returns: (subtests, vals) where subtests is the full per-subtest data
            (median of replicates, rounded to 2 decimals) and vals contains
            only the [value, name] pairs for names ending in
            "X_dropped_frames" (i.e. per-resolution dropped-frame counts).
        """
        _subtests = {}
        test_name = [
            measurement
            for measurement in test["measurements"].keys()
            if "youtube-playback" in measurement
        ]
        if len(test_name) > 0:
            data = test["measurements"].get(test_name[0])
        else:
            raise Exception("No measurements found for youtube test!")

        def create_subtest_entry(
            name,
            value,
            unit=test["subtest_unit"],
            lower_is_better=test["subtest_lower_is_better"],
        ):
            # build a list of subtests and append all related replicates
            if name not in _subtests:
                # subtest not added yet, first pagecycle, so add new one
                _subtests[name] = {
                    "name": name,
                    "unit": unit,
                    "lowerIsBetter": lower_is_better,
                    "replicates": [],
                }

            _subtests[name]["replicates"].append(value)
            if self.subtest_alert_on is not None:
                if name in self.subtest_alert_on:
                    LOG.info(
                        "turning on subtest alerting for measurement type: %s" % name
                    )
                    _subtests[name]["shouldAlert"] = True

        for pagecycle in data:
            # NOTE(review): assumes each page cycle is a 1-element list whose
            # first entry maps video name -> frame counts — confirm upstream.
            for _sub, _value in pagecycle[0].items():
                if _value["decodedFrames"] == 0:
                    self.failed_tests.append(
                        "%s test Failed. decodedFrames %s droppedFrames %s."
                        % (_sub, _value["decodedFrames"], _value["droppedFrames"])
                    )

                try:
                    percent_dropped = (
                        float(_value["droppedFrames"]) / _value["decodedFrames"] * 100.0
                    )
                except ZeroDivisionError:
                    # if no frames have been decoded the playback failed completely
                    percent_dropped = 100.0

                # Remove the not needed "PlaybackPerf." prefix from each test
                _sub = _sub.split("PlaybackPerf", 1)[-1]
                if _sub.startswith("."):
                    _sub = _sub[1:]

                # build a list of subtests and append all related replicates
                create_subtest_entry(
                    f"{_sub}_decoded_frames",
                    _value["decodedFrames"],
                    lower_is_better=False,
                )
                create_subtest_entry(f"{_sub}_dropped_frames", _value["droppedFrames"])
                create_subtest_entry(f"{_sub}_%_dropped_frames", percent_dropped)

        # Check if any youtube test failed and generate exception
        if len(self.failed_tests) > 0:
            self.youtube_playback_failure = True
        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        for name in names:
            # pylint: disable=W1633
            _subtests[name]["value"] = round(
                float(filters.median(_subtests[name]["replicates"])), 2
            )
            subtests.append(_subtests[name])
            # only include dropped_frames values, without the %_dropped_frames values
            # (names like "...@2X_dropped_frames" end in "X_dropped_frames")
            if name.endswith("X_dropped_frames"):
                vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def parseWebCodecsOutput(self, test):
        """
        Example output (this is one page cycle):

        {
            'name': 'webcodecs',
            'type': 'benchmark',
            'measurements': {
                'webcodecs': [
                    ['{
                        "vp8 realtime encode": {
                            "frame-to-frame mean (key)": {"value":5.222857,"unit":"ms"},
                            "frame-to-frame cv (key)":{"value":27.052957,"unit":"%"},
                            "frame-dropping rate (key)":{"value":0,"unit":"%"},
                            "frame-to-frame mean (non key)":{"value":1.460678,"unit":"ms"},
                            "frame-to-frame cv (non key)":{"value":65.4360136,"unit":"%"},
                            "frame-dropping rate (non key)":{"value":0,"unit":"%"}
                        }
                    }'],
                    ...
                ]
            },
            'lower_is_better': False,
            'unit': 'score'
        }

        :returns: (subtests, vals) with one subtest per
            "<test name> - <subtest name>" combination; each subtest value is
            the median of its replicates (rounded to 3 decimals when a float).
        """

        data = test["measurements"]["webcodecs"]
        results = {}
        for page_cycle in data:
            # Each page cycle entry is a 1-element list holding a JSON string.
            d = json.loads(page_cycle[0])
            for test_name, test_data in d.items():
                results.setdefault(test_name, []).append(test_data)

        _subtests = {}
        for test_name in results:
            for result in results[test_name]:
                for subtest_name, subtest_result in result.items():
                    subtest_result_name = f"{test_name} - {subtest_name}"
                    _subtests.setdefault(
                        subtest_result_name,
                        {
                            "unit": subtest_result["unit"],
                            "alertThreshold": float(test["alert_threshold"]),
                            "lowerIsBetter": test["subtest_lower_is_better"],
                            "name": subtest_result_name,
                            "replicates": [],
                            "shouldAlert": True,
                        },
                    )["replicates"].append(subtest_result["value"])

        # Summarize every accumulated subtest with the median of its
        # replicates. (The previous implementation iterated over leaked loop
        # variables — `result` from the loop above and a shadowed
        # `subtest_name` — which recomputed medians redundantly and raised a
        # KeyError later if a subtest was absent from the final page cycle.)
        for subtest in _subtests.values():
            subtest["value"] = filters.median(subtest["replicates"])

        subtests = sorted(_subtests.values(), key=lambda x: x["name"], reverse=True)
        for subtest in subtests:
            if isinstance(subtest["value"], float):
                subtest["value"] = round(subtest["value"], 3)
        vals = [[subtest["value"], subtest["name"]] for subtest in subtests]
        return subtests, vals

    def parseUnknown(self, test):
        """Parse an arbitrary benchmark result by flattening what we're given.

        The single entry in test["measurements"] is expected to hold a list
        of iterations, each a dict of metric -> value (or list of values).
        Metrics in METRIC_BLOCKLIST are skipped; replicates are summarized
        with the method named by test["submetric_summary_method"]
        (default "median").

        :returns: (subtests, vals) sorted by name in reverse order.
        :raises Exception: when test["measurements"] is not a dict.
        """
        _subtests = {}

        if not isinstance(test["measurements"], dict):
            raise Exception(
                "Expected a dictionary with a single entry as the name of the test. "
                "The value of this key should be the data."
            )

        for iteration in test["measurements"][list(test["measurements"].keys())[0]]:
            for metric, value in iteration.items():
                if metric in METRIC_BLOCKLIST:
                    # TODO: Add an option in the test manifest for this
                    continue
                if metric not in _subtests:
                    # subtest not added yet, first pagecycle, so add new one
                    _subtests[metric] = {
                        "unit": test["subtest_unit"],
                        "alertThreshold": float(test["alert_threshold"]),
                        "lowerIsBetter": test["subtest_lower_is_better"],
                        "name": metric,
                        "replicates": [],
                    }
                # Scalars are wrapped so both single values and lists of
                # values extend the replicates uniformly. NOTE(review):
                # strings are Iterable too and would be iterated char by
                # char — assumes values are numeric or lists of numerics.
                updated_metric = value
                if not isinstance(value, Iterable):
                    updated_metric = [value]
                # pylint: disable=W1633
                _subtests[metric]["replicates"].extend([
                    round(x, 3) for x in updated_metric
                ])

        vals = []
        subtests = []
        names = list(_subtests)
        names.sort(reverse=True)
        summaries = {
            "median": filters.median,
            "mean": filters.mean,
            "geomean": filters.geometric_mean,
        }
        for name in names:
            summary_method = test.get("submetric_summary_method", "median")
            _subtests[name]["value"] = round(
                summaries[summary_method](_subtests[name]["replicates"]), 3
            )
            subtests.append(_subtests[name])
            vals.append([_subtests[name]["value"], name])

        return subtests, vals

    def construct_summary(self, vals, testname, unit=None):
        """Produce the suite-level summary value for a benchmark.

        :param vals: list of [value, name] pairs from the per-test parser.
        :param testname: used to select the benchmark-specific summary rule.
        :param unit: only used for supporting_data summaries.
        :raises NotImplementedError: for unrecognized tests/units.
        """

        def _filter(vals, value=None):
            # Return just the values, optionally keeping only entries whose
            # name matches `value`.
            if value is None:
                return [i for i, j in vals]
            return [i for i, j in vals if j == value]

        if testname.startswith("raptor-v8_7"):
            return 100 * filters.geometric_mean(_filter(vals))

        if testname == "speedometer3":
            score = None
            for val, name in vals:
                if name == "score":
                    score = val
            if score is None:
                raise Exception("Unable to find score for Speedometer 3")
            return score

        if "speedometer" in testname:
            correctionFactor = 3
            results = _filter(vals)
            # speedometer has 16 tests, each of these are made of up 9 subtests
            # and a sum of the 9 values. We receive 160 values, and want to use
            # the 16 test values, not the sub test values.
            if len(results) != 160:
                raise Exception(
                    "Speedometer has 160 subtests, found: %s instead" % len(results)
                )

            results = results[9::10]
            # pylint --py3k W1619
            score = 60 * 1000 / filters.geometric_mean(results) / correctionFactor
            return score

        if "stylebench" in testname:
            # see https://bug-172968-attachments.webkit.org/attachment.cgi?id=319888
            correctionFactor = 3
            results = _filter(vals)

            # stylebench has 6 tests. Five of them are made of up 5 subtests
            #
            # * Adding classes.
            # * Removing classes.
            # * Mutating attributes.
            # * Adding leaf elements.
            # * Removing leaf elements.
            #
            # which are made of two subtests each (sync/async) and repeated 5
            # times each, thus, the list here looks like:
            #
            #   [Test name/Adding classes - 0/ Sync; <x>]
            #   [Test name/Adding classes - 0/ Async; <y>]
            #   [Test name/Adding classes - 0; <x> + <y>]
            #   ...
            #   [Test name/Removing leaf elements - 4; <x> + <y>]
            #   [Test name; <sum>]  <- This is what we want.
            #
            # So, 5 (subtests) * 5 (repetitions) *
            # 3 (entries per repetition (sync/async/sum)) = 75 entries before
            # the sum. We receive 76 entries per test, which adds up to 380.
            # We want to use the 5 test entries, not the rest.
            #
            # Then there's the sixth "Dynamic media queries" test, which gives
            # results for viewports in increments of 50px like:
            #
            #   Dynamic media queries/Resizing to 300px - 0/Sync
            #   Dynamic media queries/Resizing to 300px - 0/Async
            #   Dynamic media queries/Resizing to 300px - 0
            #   ...
            #   Dynamic media queries/Resizing to 800px - 4
            #   Dynamic media queries  <- What we want
            #
            # len([300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800]) is
            # 11, so 11 (subtests) * 5 (repetitions) *
            # 3 (entries per repetition (sync/async/sum)) = 165 entries before
            # the sum, i.e. 166 in total for this test.
            EXPECTED_ENTRIES = 380 + 166
            if len(results) != EXPECTED_ENTRIES:
                raise Exception(
                    f"StyleBench requires {EXPECTED_ENTRIES} entries, found: {len(results)} instead"
                )
            results = results[:380][75::76] + [results[-1]]
            # pylint --py3k W1619
            return 60 * 1000 / filters.geometric_mean(results) / correctionFactor

        if testname.startswith("raptor-kraken") or "sunspider" in testname:
            return sum(_filter(vals))

        if "unity-webgl" in testname or "webaudio" in testname:
            # webaudio_score and unity_webgl_score: self reported as 'Geometric Mean'
            return filters.mean(_filter(vals, "Geometric Mean"))

        if "assorted-dom" in testname:
            # pylint: disable=W1633
            return round(filters.geometric_mean(_filter(vals)), 2)

        if "wasm-misc" in testname:
            # wasm_misc_score: self reported as '__total__'
            return filters.mean(_filter(vals, "__total__"))

        if "wasm-godot" in testname:
            # wasm_godot_score: first-interactive mean
            return filters.mean(_filter(vals, "first-interactive"))

        if "youtube-playback" in testname:
            # pylint: disable=W1633
            return round(filters.mean(_filter(vals)), 2)

        if "twitch-animation" in testname:
            return round(filters.geometric_mean(_filter(vals, "run")), 2)

        # NOTE(review): substring matching — any test name containing "ve"
        # (and then "rt" or "q") lands here; presumably the video-encoding
        # (webcodecs) tests — confirm the naming scheme with the manifests.
        if "ve" in testname:
            if "rt" in testname:
                # We collect the mean and cv of frame-to-frame performance and
                # the frame-dropping rate for both keyframe and non-keyframe.
                # However, the most important factor is the frame-to-frame
                # mean, so we only include it in the summarized score. Note
                # that all the values collected are monitored by "shouldAlert".
                means = [i for i, j in vals if "mean" in j]
                if len(means) > 0:
                    return round(filters.geometric_mean(means), 2)
                return -1

            if "q" in testname:
                if len(vals) > 0:
                    return round(filters.mean(_filter(vals)), 2)
                return -1

            raise NotImplementedError("Summary for %s is not implemented" % testname)

        if testname.startswith("supporting_data"):
            if not unit:
                return sum(_filter(vals))

            if unit == "%":
                return filters.mean(_filter(vals))

            if unit in ("W", "MHz"):
                # For power in Watts and clock frequencies,
                # summarize with the sum of the averages
                allavgs = []
                for val, subtest in vals:
                    if "avg" in subtest:
                        allavgs.append(val)
                if allavgs:
                    return sum(allavgs)

                raise Exception(
                    "No average measurements found for supporting data with W, or MHz unit ."
                )

            if unit in ["KB", "mAh", "mWh"]:
                return sum(_filter(vals))

            raise NotImplementedError("Unit %s not supported" % unit)

        if len(vals) > 1:
            # pylint: disable=W1633
            return round(filters.geometric_mean(_filter(vals)), 2)

        # pylint: disable=W1633
        return round(filters.mean(_filter(vals)), 2)

    def _process_measurements(self, suite, test, measurement_name, replicates):
        """Build a subtest dict for one measurement type (cpuTime, powerUsage, ...).

        :returns: the subtest dict with name/unit/alerting settings and the
            given replicates attached.
        """
        subtest = {}
        subtest["name"] = measurement_name
        subtest["lowerIsBetter"] = test["subtest_lower_is_better"]
        subtest["alertThreshold"] = float(test["alert_threshold"])

        unit = test["subtest_unit"]
        if measurement_name == "cpuTime":
            unit = "ms"
        elif measurement_name == "powerUsage":
            unit = "uWh"
        subtest["unit"] = unit

        # Add the alert window settings if needed here too in case
        # there is no summary value in the test
        for schema_name in (
            "minBackWindow",
            "maxBackWindow",
            "foreWindow",
        ):
            if suite.get(schema_name, None) is not None:
                subtest[schema_name] = suite[schema_name]

        # if 'alert_on' is set for this particular measurement, then we want
        # to set the flag in the perfherder output to turn on alerting for
        # this subtest
        if self.subtest_alert_on is not None:
            if measurement_name in self.subtest_alert_on:
                LOG.info(
                    "turning on subtest alerting for measurement type: %s"
                    % measurement_name
                )
                subtest["shouldAlert"] = True
                # Chromium-based browsers never alert on these measurements.
                if self.app in (
                    "chrome",
                    "chrome-m",
                    "custom-car",
                    "cstm-car-m",
                ):
                    subtest["shouldAlert"] = False
            else:
                # Explicitly set `shouldAlert` to False so that the measurement
                # is not alerted on. Otherwise Perfherder defaults to alerting.
                LOG.info(
                    "turning off subtest alerting for measurement type: %s"
                    % measurement_name
                )
                subtest["shouldAlert"] = False

        if self.power_test and measurement_name == "powerUsage":
            subtest["shouldAlert"] = True

        subtest["replicates"] = replicates
        return subtest

    def summarize_test(self, test, suite, **kwargs):
        """Parse one test's measurements into suite["subtests"] and a summary.

        Dispatches to the youtube-playback, webcodecs, or generic parser
        based on the test name, then computes the suite value when there is
        more than one subtest.
        """
        subtests = None
        if "youtube-playback" in test["name"]:
            subtests, vals = self.parseYoutubePlaybackPerformanceOutput(test)
        # NOTE(review): substring match — any name containing "ve" is
        # treated as a webcodecs test; confirm against the test manifests.
        elif "ve" in test["name"]:
            subtests, vals = self.parseWebCodecsOutput(test)
        else:
            # Attempt to parse the unknown benchmark by flattening the
            # given data and merging all the arrays of non-iterable
            # data that fall under the same key.
            # XXX Note that this is not fully implemented for the summary
            # of the metric or test as we don't have a use case for that yet.
            subtests, vals = self.parseUnknown(test)

        if subtests is None:
            raise Exception("No benchmark metrics found in browsertime results")

        suite["subtests"] = subtests

        self.add_additional_metrics(test, suite)

        # summarize results for both benchmark type tests
        if len(subtests) > 1:
            suite["value"] = self.construct_summary(vals, testname=test["name"])
        subtests.sort(key=lambda subtest: subtest["name"])

    def summarize_suites(self, suites):
        """No suite-level post-processing is needed for benchmarks."""
        pass

    def report_test_success(self):
        """Return False (and log details) when any sub-test failed."""
        if len(self.failed_tests) > 0:
            LOG.warning("Some tests failed.")
            if self.youtube_playback_failure:
                for test in self.failed_tests:
                    LOG.warning("Youtube sub-test FAILED: %s" % test)
                LOG.warning(
                    "Youtube playback sub-tests failed!!! "
                    "Not submitting results to perfherder!"
                )
            return False
        return True