tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

dmd.py (35278B)


      1 #! /usr/bin/env python3
      2 #
      3 # This Source Code Form is subject to the terms of the Mozilla Public
      4 # License, v. 2.0. If a copy of the MPL was not distributed with this
      5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      6 
      7 """This script analyzes a JSON file emitted by DMD."""
      8 
      9 import argparse
     10 import collections
     11 import gzip
     12 import io
     13 import json
     14 import os
     15 import platform
     16 import re
     17 import shutil
     18 import sys
     19 import tempfile
     20 from bisect import bisect_right
     21 from functools import cmp_to_key
     22 from typing import Callable
     23 
# The DMD output version this script handles.
outputVersion = 5

# If --ignore-alloc-fns is specified, stack frames containing functions that
# match these strings will be removed from the *start* of stack traces. (Once
# we hit a non-matching frame, any subsequent frames won't be removed even if
# they do match.)
#
# Each entry is used as a substring pattern: getDigestFromFile() regex-escapes
# every entry, joins them with '|', and applies re.search() to each frame
# description, so an entry matches anywhere in the frame text.
allocatorFns = [
    # Matches malloc, replace_malloc, moz_xmalloc, vpx_malloc, js_malloc,
    # pod_malloc, malloc_zone_*, g_malloc.
    "malloc",
    # Matches calloc, replace_calloc, moz_xcalloc, vpx_calloc, js_calloc,
    # pod_calloc, malloc_zone_calloc, pod_callocCanGC.
    "calloc",
    # Matches realloc, replace_realloc, moz_xrealloc, vpx_realloc, js_realloc,
    # pod_realloc, pod_reallocCanGC.
    "realloc",
    # Matches memalign, posix_memalign, replace_memalign, replace_posix_memalign,
    # moz_xmemalign, vpx_memalign, malloc_zone_memalign.
    "memalign",
    "operator new(",
    "operator new[](",
    "g_slice_alloc",
    # This one is necessary to fully filter some sequences of allocation
    # functions that happen in practice. Note that ??? entries that follow
    # non-allocation functions won't be stripped, as explained above.
    "???",
    # Match DMD internals.
    "mozilla::dmd::AllocCallback",
    "mozilla::dmd::StackTrace::Get",
]
     55 
     56 
     57 def cmp(a, b):
     58    return (a > b) - (a < b)
     59 
     60 
     61 class Record:
     62    """A record is an aggregation of heap blocks that have identical stack
     63    traces. It can also be used to represent the difference between two
     64    records."""
     65 
     66    def __init__(self):
     67        self.numBlocks = 0
     68        self.reqSize = 0
     69        self.slopSize = 0
     70        self.usableSize = 0
     71        self.allocatedAtDesc = None
     72        self.reportedAtDescs = []
     73        self.usableSizes = collections.defaultdict(int)
     74        self.addrs = []
     75 
     76    def isZero(self, args):
     77        return (
     78            self.numBlocks == 0
     79            and self.reqSize == 0
     80            and self.slopSize == 0
     81            and self.usableSize == 0
     82            and len(self.usableSizes) == 0
     83        )
     84 
     85    def negate(self):
     86        self.numBlocks = -self.numBlocks
     87        self.reqSize = -self.reqSize
     88        self.slopSize = -self.slopSize
     89        self.usableSize = -self.usableSize
     90 
     91        negatedUsableSizes = collections.defaultdict(int)
     92        for usableSize, count in self.usableSizes.items():
     93            negatedUsableSizes[-usableSize] = count
     94        self.usableSizes = negatedUsableSizes
     95 
     96    def subtract(self, r):
     97        # We should only be calling this on records with matching stack traces.
     98        # Check this.
     99        assert self.allocatedAtDesc == r.allocatedAtDesc
    100        assert self.reportedAtDescs == r.reportedAtDescs
    101 
    102        self.numBlocks -= r.numBlocks
    103        self.reqSize -= r.reqSize
    104        self.slopSize -= r.slopSize
    105        self.usableSize -= r.usableSize
    106 
    107        usableSizes1 = self.usableSizes
    108        usableSizes2 = r.usableSizes
    109        usableSizes3 = collections.defaultdict(int)
    110        for usableSize in usableSizes1:
    111            counts1 = usableSizes1[usableSize]
    112            if usableSize in usableSizes2:
    113                counts2 = usableSizes2[usableSize]
    114                del usableSizes2[usableSize]
    115                counts3 = counts1 - counts2
    116                if counts3 != 0:
    117                    if counts3 < 0:
    118                        usableSize = -usableSize
    119                        counts3 = -counts3
    120                    usableSizes3[usableSize] = counts3
    121            else:
    122                usableSizes3[usableSize] = counts1
    123 
    124        for usableSize in usableSizes2:
    125            usableSizes3[-usableSize] = usableSizes2[usableSize]
    126 
    127        self.usableSizes = usableSizes3
    128 
    129    @staticmethod
    130    def cmpByUsableSize(r1, r2):
    131        # Sort by usable size, then by req size.
    132        return cmp(abs(r1.usableSize), abs(r2.usableSize)) or Record.cmpByReqSize(
    133            r1, r2
    134        )
    135 
    136    @staticmethod
    137    def cmpByReqSize(r1, r2):
    138        # Sort by req size.
    139        return cmp(abs(r1.reqSize), abs(r2.reqSize))
    140 
    141    @staticmethod
    142    def cmpBySlopSize(r1, r2):
    143        # Sort by slop size.
    144        return cmp(abs(r1.slopSize), abs(r2.slopSize))
    145 
    146    @staticmethod
    147    def cmpByNumBlocks(r1, r2):
    148        # Sort by block counts, then by usable size.
    149        return cmp(abs(r1.numBlocks), abs(r2.numBlocks)) or Record.cmpByUsableSize(
    150            r1, r2
    151        )
    152 
    153 
# Maps each value of the -s/--sort-by option to the Record comparator that
# implements it (used via functools.cmp_to_key when sorting records).
sortByChoices = {
    "usable": Record.cmpByUsableSize,  # the default
    "req": Record.cmpByReqSize,
    "slop": Record.cmpBySlopSize,
    "num-blocks": Record.cmpByNumBlocks,
}
    160 
    161 
    162 def parseCommandLine():
    163    # 24 is the maximum number of frames that DMD will produce.
    164    def range_1_24(string):
    165        value = int(string)
    166        if value < 1 or value > 24:
    167            msg = f"{string:s} is not in the range 1..24"
    168            raise argparse.ArgumentTypeError(msg)
    169        return value
    170 
    171    description = """
    172 Analyze heap data produced by DMD.
    173 If one file is specified, analyze it; if two files are specified, analyze the
    174 difference.
    175 Input files can be gzipped.
    176 Write to stdout unless -o/--output is specified.
    177 Stack traces are fixed to show function names, filenames and line numbers
    178 unless --no-fix-stacks is specified; stack fixing modifies the original file
    179 and may take some time. If specified, the BREAKPAD_SYMBOLS_PATH environment
    180 variable is used to find breakpad symbols for stack fixing.
    181 """
    182    p = argparse.ArgumentParser(description=description)
    183 
    184    p.add_argument(
    185        "-o",
    186        "--output",
    187        type=argparse.FileType("w"),
    188        help="output file; stdout if unspecified",
    189    )
    190 
    191    p.add_argument(
    192        "-f",
    193        "--max-frames",
    194        type=range_1_24,
    195        default=8,
    196        help="maximum number of frames to consider in each trace",
    197    )
    198 
    199    p.add_argument(
    200        "-s",
    201        "--sort-by",
    202        choices=sortByChoices.keys(),
    203        default="usable",
    204        help="sort the records by a particular metric",
    205    )
    206 
    207    p.add_argument(
    208        "-a",
    209        "--ignore-alloc-fns",
    210        action="store_true",
    211        help="ignore allocation functions at the start of traces",
    212    )
    213 
    214    p.add_argument("--no-fix-stacks", action="store_true", help="do not fix stacks")
    215 
    216    p.add_argument(
    217        "--clamp-contents",
    218        action="store_true",
    219        help="for a scan mode log, clamp addresses to the start of live blocks, "
    220        "or zero if not in one",
    221    )
    222 
    223    p.add_argument(
    224        "--print-clamp-stats",
    225        action="store_true",
    226        help="print information about the results of pointer clamping; mostly "
    227        "useful for debugging clamping",
    228    )
    229 
    230    p.add_argument(
    231        "--filter-stacks-for-testing",
    232        action="store_true",
    233        help="filter stack traces; only useful for testing purposes",
    234    )
    235 
    236    p.add_argument(
    237        "--filter",
    238        default=[],
    239        action="append",
    240        help="Only print entries that have a stack that matches the filter. "
    241        "A filter may be negated by prefixing it with `!`. "
    242        "If multiple filters are specified, all of them must match.",
    243    )
    244 
    245    p.add_argument("input_file", help="a file produced by DMD")
    246 
    247    p.add_argument(
    248        "input_file2",
    249        nargs="?",
    250        help="a file produced by DMD; if present, it is diff'd with input_file",
    251    )
    252 
    253    return p.parse_args(sys.argv[1:])
    254 
    255 
    256 # Fix stacks if necessary: first write the output to a tempfile, then replace
    257 # the original file with it.
    258 def fixStackTraces(inputFilename, isZipped, opener):
    259    # This append() call is needed to make the import statements work when this
    260    # script is installed as a symlink.
    261    sys.path.append(os.path.dirname(__file__))
    262 
    263    bpsyms = os.environ.get("BREAKPAD_SYMBOLS_PATH", None)
    264    sysname = platform.system()
    265    if bpsyms and os.path.exists(bpsyms):
    266        import fix_stacks as fixModule
    267 
    268        def fix(line):
    269            return fixModule.fixSymbols(line, jsonMode=True, breakpadSymsDir=bpsyms)
    270 
    271    elif sysname in ("Linux", "Darwin", "Windows"):
    272        import fix_stacks as fixModule
    273 
    274        def fix(line):
    275            return fixModule.fixSymbols(line, jsonMode=True)
    276 
    277    else:
    278        return
    279 
    280    # Fix stacks, writing output to a temporary file, and then overwrite the
    281    # original file.
    282    tmpFile = tempfile.NamedTemporaryFile(delete=False)
    283 
    284    # If the input is gzipped, then the output (written initially to |tmpFile|)
    285    # should be gzipped as well.
    286    #
    287    # And we want to set its pre-gzipped filename to '' rather than the name of
    288    # the temporary file, so that programs like the Unix 'file' utility don't
    289    # say that it was called 'tmp6ozTxE' (or something like that) before it was
    290    # zipped. So that explains the |filename=''| parameter.
    291    #
    292    # But setting the filename like that clobbers |tmpFile.name|, so we must
    293    # get that now in order to move |tmpFile| at the end.
    294    tmpFilename = tmpFile.name
    295    if isZipped:
    296        tmpFile = gzip.GzipFile(filename="", fileobj=tmpFile, mode="wb")
    297 
    298    with opener(inputFilename, "rb") as inputFile:
    299        for line in inputFile:
    300            tmpFile.write(fix(line))
    301 
    302    tmpFile.close()
    303 
    304    shutil.move(tmpFilename, inputFilename)
    305 
    306 
    307 def getDigestFromFile(args, inputFile):
    308    # Handle gzipped input if necessary.
    309    isZipped = inputFile.endswith(".gz")
    310    opener = gzip.open if isZipped else open
    311 
    312    # Fix stack traces unless otherwise instructed.
    313    if not args.no_fix_stacks:
    314        fixStackTraces(inputFile, isZipped, opener)
    315 
    316    if args.clamp_contents:
    317        clampBlockList(args, inputFile, isZipped, opener)
    318 
    319    with opener(inputFile, "rb") as f:
    320        j = json.load(f)
    321 
    322    if j["version"] != outputVersion:
    323        raise Exception(f"'version' property isn't '{outputVersion:d}'")
    324 
    325    # Extract the main parts of the JSON object.
    326    invocation = j["invocation"]
    327    dmdEnvVar = invocation["dmdEnvVar"]
    328    mode = invocation["mode"]
    329    blockList = j["blockList"]
    330    traceTable = j["traceTable"]
    331    frameTable = j["frameTable"]
    332 
    333    # Insert the necessary entries for unrecorded stack traces. Note that 'ut'
    334    # and 'uf' will not overlap with any keys produced by DMD's
    335    # ToIdStringConverter::Base32() function.
    336    unrecordedTraceID = "ut"
    337    unrecordedFrameID = "uf"
    338    traceTable[unrecordedTraceID] = [unrecordedFrameID]
    339    frameTable[unrecordedFrameID] = (
    340        "#00: (no stack trace recorded due to --stacks=partial)"
    341    )
    342 
    343    # For the purposes of this script, 'scan' behaves like 'live'.
    344    if mode == "scan":
    345        mode = "live"
    346 
    347    if mode not in ["live", "dark-matter", "cumulative"]:
    348        raise Exception(f"bad 'mode' property: '{mode:s}'")
    349 
    350    # Remove allocation functions at the start of traces.
    351    if args.ignore_alloc_fns:
    352        # Build a regexp that matches every function in allocatorFns.
    353        escapedAllocatorFns = map(re.escape, allocatorFns)
    354        fn_re = re.compile("|".join(escapedAllocatorFns))
    355 
    356        # Remove allocator fns from each stack trace.
    357        for traceKey, frameKeys in traceTable.items():
    358            numSkippedFrames = 0
    359            for frameKey in frameKeys:
    360                frameDesc = frameTable[frameKey]
    361                if re.search(fn_re, frameDesc):
    362                    numSkippedFrames += 1
    363                else:
    364                    break
    365            if numSkippedFrames > 0:
    366                traceTable[traceKey] = frameKeys[numSkippedFrames:]
    367 
    368    # Trim the number of frames.
    369    for traceKey, frameKeys in traceTable.items():
    370        if len(frameKeys) > args.max_frames:
    371            del frameKeys[args.max_frames :]
    372 
    373    def buildTraceDescription(traceTable, frameTable, traceKey):
    374        frameKeys = traceTable[traceKey]
    375        fmt = "    #{:02d}{:}"
    376 
    377        if args.filter_stacks_for_testing:
    378            # This option is used by `test_dmd.js`, which runs the code in
    379            # `SmokeDMD.cpp`. When running that test, there is too much
    380            # variation in the stack traces across different machines and
    381            # platforms to do exact output matching. However, every stack trace
    382            # should have at least three frames that contain `DMD` (in one of
    383            # `DMD.cpp`, `SmokeDMD.cpp`, `SmokeDMD`, or `SmokeDMD.exe`). Some
    384            # example frames from automation (where `..` indicates excised path
    385            # segments):
    386            #
    387            # Linux debug, with stack fixing using breakpad syms:
    388            # `#01: replace_realloc(void*, unsigned long) [../dmd/DMD.cpp:1110]`
    389            #
    390            # Linux opt, with native stack fixing:
    391            # `#02: TestFull(char const*, int, char const*, int) (../dmd/test/SmokeDMD.cpp:165)`
    392            #
    393            # Mac opt, with native stack fixing:
    394            # `#03: RunTests() (../build/tests/bin/SmokeDMD + 0x21f9)`
    395            #
    396            # Windows opt, with native stack fixing failing due to a missing PDB:
    397            # `#04: ??? (..\\build\\tests\\bin\\SmokeDMD.exe + 0x1c58)`
    398            #
    399            # If we see three such frames, we replace the entire stack trace
    400            # with a single, predictable frame. This imprecise matching will at
    401            # least detect if stack fixing fails completely.
    402            dmd_frame_matches = 0
    403            for frameKey in frameKeys:
    404                frameDesc = frameTable[frameKey]
    405                if "DMD" in frameDesc:
    406                    dmd_frame_matches += 1
    407                    if dmd_frame_matches >= 3:
    408                        return [fmt.format(1, ": ... DMD.cpp ...")]
    409 
    410        # The frame number is always '#00' (see DMD.h for why), so we have to
    411        # replace that with the correct frame number.
    412        desc = []
    413        for n, frameKey in enumerate(traceTable[traceKey], start=1):
    414            desc.append(fmt.format(n, frameTable[frameKey][3:]))
    415        return desc
    416 
    417    # Aggregate blocks into records. All sufficiently similar blocks go into a
    418    # single record.
    419 
    420    if mode in ["live", "cumulative"]:
    421        liveOrCumulativeRecords = collections.defaultdict(Record)
    422    elif mode == "dark-matter":
    423        unreportedRecords = collections.defaultdict(Record)
    424        onceReportedRecords = collections.defaultdict(Record)
    425        twiceReportedRecords = collections.defaultdict(Record)
    426 
    427    heapUsableSize = 0
    428    heapBlocks = 0
    429 
    430    recordKeyPartCache = {}
    431 
    432    for block in blockList:
    433        # For each block we compute a |recordKey|, and all blocks with the same
    434        # |recordKey| are aggregated into a single record. The |recordKey| is
    435        # derived from the block's 'alloc' and 'reps' (if present) stack
    436        # traces.
    437        #
    438        # We use frame descriptions (e.g. "#00: foo (X.cpp:99)") when comparing
    439        # traces for equality. We can't use trace keys or frame keys because
    440        # they're not comparable across different DMD runs (which is relevant
    441        # when doing diffs).
    442        #
    443        # Using frame descriptions also fits in with the stack trimming done
    444        # for --max-frames, which requires that stack traces with common
    445        # beginnings but different endings to be considered equivalent. E.g. if
    446        # we have distinct traces T1:[A:D1,B:D2,C:D3] and T2:[X:D1,Y:D2,Z:D4]
    447        # and we trim the final frame of each they should be considered
    448        # equivalent because the untrimmed frame descriptions (D1 and D2)
    449        # match.
    450        #
    451        # Having said all that, during a single invocation of dmd.py on a
    452        # single DMD file, for a single frameKey value the record key will
    453        # always be the same, and we might encounter it 1000s of times. So we
    454        # cache prior results for speed.
    455        def makeRecordKeyPart(traceKey):
    456            if traceKey in recordKeyPartCache:
    457                return recordKeyPartCache[traceKey]
    458 
    459            recordKeyPart = str(
    460                list(map(lambda frameKey: frameTable[frameKey], traceTable[traceKey]))
    461            )
    462            recordKeyPartCache[traceKey] = recordKeyPart
    463            return recordKeyPart
    464 
    465        allocatedAtTraceKey = block.get("alloc", unrecordedTraceID)
    466        if mode in ["live", "cumulative"]:
    467            recordKey = makeRecordKeyPart(allocatedAtTraceKey)
    468            records = liveOrCumulativeRecords
    469        elif mode == "dark-matter":
    470            recordKey = makeRecordKeyPart(allocatedAtTraceKey)
    471            if "reps" in block:
    472                reportedAtTraceKeys = block["reps"]
    473                for reportedAtTraceKey in reportedAtTraceKeys:
    474                    recordKey += makeRecordKeyPart(reportedAtTraceKey)
    475                if len(reportedAtTraceKeys) == 1:
    476                    records = onceReportedRecords
    477                else:
    478                    records = twiceReportedRecords
    479            else:
    480                records = unreportedRecords
    481 
    482        record = records[recordKey]
    483 
    484        if "req" not in block:
    485            raise Exception("'req' property missing in block'")
    486 
    487        reqSize = block["req"]
    488        slopSize = block.get("slop", 0)
    489 
    490        if "num" in block:
    491            num = block["num"]
    492        else:
    493            num = 1
    494 
    495        usableSize = reqSize + slopSize
    496        heapUsableSize += num * usableSize
    497        heapBlocks += num
    498 
    499        record.numBlocks += num
    500        record.reqSize += num * reqSize
    501        record.slopSize += num * slopSize
    502        record.usableSize += num * usableSize
    503        if record.allocatedAtDesc is None:
    504            record.allocatedAtDesc = buildTraceDescription(
    505                traceTable, frameTable, allocatedAtTraceKey
    506            )
    507 
    508        # In heap scan mode, we record the address of every block.
    509        if "addr" in block:
    510            record.addrs.append(block["addr"])
    511 
    512        if mode in ["live", "cumulative"]:
    513            pass
    514        elif mode == "dark-matter":
    515            if "reps" in block and record.reportedAtDescs == []:
    516 
    517                def f(k):
    518                    return buildTraceDescription(traceTable, frameTable, k)
    519 
    520                record.reportedAtDescs = list(map(f, reportedAtTraceKeys))
    521        record.usableSizes[usableSize] += num
    522 
    523    # All the processed data for a single DMD file is called a "digest".
    524    digest = {}
    525    digest["dmdEnvVar"] = dmdEnvVar
    526    digest["mode"] = mode
    527    digest["heapUsableSize"] = heapUsableSize
    528    digest["heapBlocks"] = heapBlocks
    529    if mode in ["live", "cumulative"]:
    530        digest["liveOrCumulativeRecords"] = liveOrCumulativeRecords
    531    elif mode == "dark-matter":
    532        digest["unreportedRecords"] = unreportedRecords
    533        digest["onceReportedRecords"] = onceReportedRecords
    534        digest["twiceReportedRecords"] = twiceReportedRecords
    535    return digest
    536 
    537 
    538 def diffRecords(args, records1, records2):
    539    records3 = {}
    540 
    541    # Process records1.
    542    for k in records1:
    543        r1 = records1[k]
    544        if k in records2:
    545            # This record is present in both records1 and records2.
    546            r2 = records2[k]
    547            del records2[k]
    548            r2.subtract(r1)
    549            if not r2.isZero(args):
    550                records3[k] = r2
    551        else:
    552            # This record is present only in records1.
    553            r1.negate()
    554            records3[k] = r1
    555 
    556    for k in records2:
    557        # This record is present only in records2.
    558        records3[k] = records2[k]
    559 
    560    return records3
    561 
    562 
    563 def diffDigests(args, d1, d2):
    564    if d1["mode"] != d2["mode"]:
    565        raise Exception("the input files have different 'mode' properties")
    566 
    567    d3 = {}
    568    d3["dmdEnvVar"] = (d1["dmdEnvVar"], d2["dmdEnvVar"])
    569    d3["mode"] = d1["mode"]
    570    d3["heapUsableSize"] = d2["heapUsableSize"] - d1["heapUsableSize"]
    571    d3["heapBlocks"] = d2["heapBlocks"] - d1["heapBlocks"]
    572    if d1["mode"] in ["live", "cumulative"]:
    573        d3["liveOrCumulativeRecords"] = diffRecords(
    574            args, d1["liveOrCumulativeRecords"], d2["liveOrCumulativeRecords"]
    575        )
    576    elif d1["mode"] == "dark-matter":
    577        d3["unreportedRecords"] = diffRecords(
    578            args, d1["unreportedRecords"], d2["unreportedRecords"]
    579        )
    580        d3["onceReportedRecords"] = diffRecords(
    581            args, d1["onceReportedRecords"], d2["onceReportedRecords"]
    582        )
    583        d3["twiceReportedRecords"] = diffRecords(
    584            args, d1["twiceReportedRecords"], d2["twiceReportedRecords"]
    585        )
    586    return d3
    587 
    588 
    589 def printDigest(args, digest):
    590    dmdEnvVar = digest["dmdEnvVar"]
    591    mode = digest["mode"]
    592    heapUsableSize = digest["heapUsableSize"]
    593    heapBlocks = digest["heapBlocks"]
    594    if mode in ["live", "cumulative"]:
    595        liveOrCumulativeRecords = digest["liveOrCumulativeRecords"]
    596    elif mode == "dark-matter":
    597        unreportedRecords = digest["unreportedRecords"]
    598        onceReportedRecords = digest["onceReportedRecords"]
    599        twiceReportedRecords = digest["twiceReportedRecords"]
    600 
    601    separator = "#" + "-" * 65 + "\n"
    602 
    603    def number(n):
    604        """Format a number with comma as a separator."""
    605        return f"{n:,d}"
    606 
    607    def perc(m, n):
    608        return 0 if n == 0 else (100 * m / n)
    609 
    610    def plural(n):
    611        return "" if n == 1 else "s"
    612 
    613    # Prints to stdout, or to file if -o/--output was specified.
    614    def out(*arguments, **kwargs):
    615        print(*arguments, file=args.output, **kwargs)
    616 
    617    def printStack(traceDesc):
    618        for frameDesc in traceDesc:
    619            out(frameDesc)
    620 
    621    def printRecords(recordKind, records, heapUsableSize):
    622        RecordKind = recordKind.capitalize()
    623        out(separator)
    624        numRecords = len(records)
    625        cmpRecords = sortByChoices[args.sort_by]
    626        sortedRecords = sorted(
    627            records.values(), key=cmp_to_key(cmpRecords), reverse=True
    628        )
    629        kindBlocks = 0
    630        kindUsableSize = 0
    631        maxRecord = 1000
    632 
    633        def is_match(rec: Record, key: str):
    634            return any(key in desc for desc in rec.allocatedAtDesc)
    635 
    636        for arg in args.filter:
    637            key: str
    638            cond: Callable[[Record], bool]
    639            if arg.startswith("\\"):
    640                # just in case you really need to start a filter with '!' (or '\')
    641                key = arg[1:]
    642                cond = is_match
    643            elif arg.startswith("!"):
    644                key = arg[1:]
    645 
    646                def cond(rec, key):
    647                    return not is_match(rec, key)  # noqa: E731
    648 
    649            else:
    650                key = arg
    651                cond = is_match
    652            sortedRecords = [rec for rec in sortedRecords if cond(rec, key)]
    653 
    654        # First iteration: get totals, etc.
    655        for record in sortedRecords:
    656            kindBlocks += record.numBlocks
    657            kindUsableSize += record.usableSize
    658 
    659        # Second iteration: print.
    660        if numRecords == 0:
    661            out(f"# no {recordKind} heap blocks\n")
    662 
    663        kindCumulativeUsableSize = 0
    664        for i, record in enumerate(sortedRecords, start=1):
    665            # Stop printing at the |maxRecord|th record.
    666            if i == maxRecord:
    667                out(f"# {RecordKind}: stopping after {i:,d} heap block records\n")
    668                break
    669 
    670            kindCumulativeUsableSize += record.usableSize
    671 
    672            out(RecordKind + " {")
    673            out(
    674                f"  {number(record.numBlocks)} block{plural(record.numBlocks)} in heap block record {i:,d} of {numRecords:,d}"
    675            )
    676            if record.addrs:
    677                if args.filter_stacks_for_testing:
    678                    # These addresses will vary, so for testing replace them with a fixed value.
    679                    baseAddrs = ["dadadada" for a in record.addrs]
    680                else:
    681                    baseAddrs = sorted(record.addrs)
    682                addrsString = ", ".join([f"0x{a}" for a in baseAddrs])
    683                out("  block addresses: " + addrsString)
    684            out(
    685                f"  {number(record.usableSize)} bytes ({number(record.reqSize)} requested / {number(record.slopSize)} slop)"
    686            )
    687 
    688            usableSizes = sorted(
    689                record.usableSizes.items(), key=lambda x: abs(x[0]), reverse=True
    690            )
    691            hasSingleBlock = len(usableSizes) == 1 and usableSizes[0][1] == 1
    692 
    693            if not hasSingleBlock:
    694                out("  Individual block sizes: ", end="")
    695                if len(usableSizes) == 0:
    696                    out("(no change)", end="")
    697                else:
    698                    isFirst = True
    699                    for usableSize, count in usableSizes:
    700                        if not isFirst:
    701                            out("; ", end="")
    702                        out(f"{number(usableSize)}", end="")
    703                        if count > 1:
    704                            out(f" x {count:,d}", end="")
    705                        isFirst = False
    706                out()
    707 
    708            out(
    709                f"  {perc(record.usableSize, heapUsableSize):4.2f}% of the heap ({perc(kindCumulativeUsableSize, heapUsableSize):4.2f}% cumulative)"
    710            )
    711            if mode in ["live", "cumulative"]:
    712                pass
    713            elif mode == "dark-matter":
    714                out(
    715                    f"  {perc(record.usableSize, kindUsableSize):4.2f}% of {recordKind} ({perc(kindCumulativeUsableSize, kindUsableSize):4.2f}% cumulative)"
    716                )
    717            out("  Allocated at {")
    718            printStack(record.allocatedAtDesc)
    719            out("  }")
    720            if mode in ["live", "cumulative"]:
    721                pass
    722            elif mode == "dark-matter":
    723                for n, reportedAtDesc in enumerate(record.reportedAtDescs):
    724                    again = "again " if n > 0 else ""
    725                    out(f"  Reported {again}at {{")
    726                    printStack(reportedAtDesc)
    727                    out("  }")
    728            out("}\n")
    729 
    730        return (kindUsableSize, kindBlocks)
    731 
    732    def printInvocation(n, dmdEnvVar, mode):
    733        out(f"Invocation{n} {{")
    734        if dmdEnvVar is None:
    735            out("  $DMD is undefined")
    736        else:
    737            out("  $DMD = '" + dmdEnvVar + "'")
    738        out("  Mode = '" + mode + "'")
    739        out("}\n")
    740 
    741    # Print command line. Strip dirs so the output is deterministic, which is
    742    # needed for testing.
    743    out(separator, end="")
    744    out("# " + " ".join(map(os.path.basename, sys.argv)) + "\n")
    745 
    746    # Print invocation(s).
    747    if type(dmdEnvVar) is not tuple:
    748        printInvocation("", dmdEnvVar, mode)
    749    else:
    750        printInvocation(" 1", dmdEnvVar[0], mode)
    751        printInvocation(" 2", dmdEnvVar[1], mode)
    752 
    753    # Print records.
    754    if mode in ["live", "cumulative"]:
    755        liveOrCumulativeUsableSize, liveOrCumulativeBlocks = printRecords(
    756            mode, liveOrCumulativeRecords, heapUsableSize
    757        )
    758    elif mode == "dark-matter":
    759        twiceReportedUsableSize, twiceReportedBlocks = printRecords(
    760            "twice-reported", twiceReportedRecords, heapUsableSize
    761        )
    762 
    763        unreportedUsableSize, unreportedBlocks = printRecords(
    764            "unreported", unreportedRecords, heapUsableSize
    765        )
    766 
    767        onceReportedUsableSize, onceReportedBlocks = printRecords(
    768            "once-reported", onceReportedRecords, heapUsableSize
    769        )
    770 
    771    # Print summary.
    772    out(separator)
    773    out("Summary {")
    774    if mode in ["live", "cumulative"]:
    775        out(
    776            f"  Total: {number(liveOrCumulativeUsableSize)} bytes in {number(liveOrCumulativeBlocks)} blocks"
    777        )
    778    elif mode == "dark-matter":
    779        fmt = "  {:15} {:>12} bytes ({:6.2f}%) in {:>7} blocks ({:6.2f}%)"
    780        out(fmt.format("Total:", number(heapUsableSize), 100, number(heapBlocks), 100))
    781        out(
    782            fmt.format(
    783                "Unreported:",
    784                number(unreportedUsableSize),
    785                perc(unreportedUsableSize, heapUsableSize),
    786                number(unreportedBlocks),
    787                perc(unreportedBlocks, heapBlocks),
    788            )
    789        )
    790        out(
    791            fmt.format(
    792                "Once-reported:",
    793                number(onceReportedUsableSize),
    794                perc(onceReportedUsableSize, heapUsableSize),
    795                number(onceReportedBlocks),
    796                perc(onceReportedBlocks, heapBlocks),
    797            )
    798        )
    799        out(
    800            fmt.format(
    801                "Twice-reported:",
    802                number(twiceReportedUsableSize),
    803                perc(twiceReportedUsableSize, heapUsableSize),
    804                number(twiceReportedBlocks),
    805                perc(twiceReportedBlocks, heapBlocks),
    806            )
    807        )
    808    out("}\n")
    809 
    810 
    811 #############################
    812 # Pretty printer for DMD JSON
    813 #############################
    814 
    815 
    816 def prettyPrintDmdJson(out, j):
    817    out.write("{\n")
    818 
    819    out.write(' "version": {},\n'.format(j["version"]))
    820    out.write(' "invocation": ')
    821    json.dump(j["invocation"], out, sort_keys=True)
    822    out.write(",\n")
    823 
    824    out.write(' "blockList": [')
    825    first = True
    826    for b in j["blockList"]:
    827        out.write("" if first else ",")
    828        out.write("\n  ")
    829        json.dump(b, out, sort_keys=True)
    830        first = False
    831    out.write("\n ],\n")
    832 
    833    out.write(' "traceTable": {')
    834    first = True
    835    for k, l in j["traceTable"].items():
    836        out.write("" if first else ",")
    837        out.write(f'\n  "{k}": {json.dumps(l)}')
    838        first = False
    839    out.write("\n },\n")
    840 
    841    out.write(' "frameTable": {')
    842    first = True
    843    for k, v in j["frameTable"].items():
    844        out.write("" if first else ",")
    845        out.write(f'\n  "{k}": {json.dumps(v)}')
    846        first = False
    847    out.write("\n }\n")
    848 
    849    out.write("}\n")
    850 
    851 
    852 ##################################################################
    853 # Code for clamping addresses using conservative pointer analysis.
    854 ##################################################################
    855 
    856 
    857 # Start is the address of the first byte of the block, while end is
    858 # the address of the first byte after the final byte in the block.
    859 class AddrRange:
    860    def __init__(self, block, length):
    861        self.block = block
    862        self.start = int(block, 16)
    863        self.length = length
    864        self.end = self.start + self.length
    865 
    866        assert self.start > 0
    867        assert length >= 0
    868 
    869 
    870 class ClampStats:
    871    def __init__(self):
    872        # Number of pointers already pointing to the start of a block.
    873        self.startBlockPtr = 0
    874 
    875        # Number of pointers pointing to the middle of a block. These
    876        # are clamped to the start of the block they point into.
    877        self.midBlockPtr = 0
    878 
    879        # Number of null pointers.
    880        self.nullPtr = 0
    881 
    882        # Number of non-null pointers that didn't point into the middle
    883        # of any blocks. These are clamped to null.
    884        self.nonNullNonBlockPtr = 0
    885 
    886    def clampedBlockAddr(self, sameAddress):
    887        if sameAddress:
    888            self.startBlockPtr += 1
    889        else:
    890            self.midBlockPtr += 1
    891 
    892    def nullAddr(self):
    893        self.nullPtr += 1
    894 
    895    def clampedNonBlockAddr(self):
    896        self.nonNullNonBlockPtr += 1
    897 
    898    def log(self):
    899        sys.stderr.write("Results:\n")
    900        sys.stderr.write(
    901            "  Number of pointers already pointing to start of blocks: "
    902            + str(self.startBlockPtr)
    903            + "\n"
    904        )
    905        sys.stderr.write(
    906            "  Number of pointers clamped to start of blocks: "
    907            + str(self.midBlockPtr)
    908            + "\n"
    909        )
    910        sys.stderr.write(
    911            "  Number of non-null pointers not pointing into blocks "
    912            "clamped to null: " + str(self.nonNullNonBlockPtr) + "\n"
    913        )
    914        sys.stderr.write("  Number of null pointers: " + str(self.nullPtr) + "\n")
    915 
    916 
# Search the block ranges array for a block that address points into.
# The search is carried out in an array of starting addresses for each block
# because it is faster.
    920 def clampAddress(blockRanges, blockStarts, clampStats, address):
    921    i = bisect_right(blockStarts, address)
    922 
    923    # Any addresses completely out of the range should have been eliminated already.
    924    assert i > 0
    925    r = blockRanges[i - 1]
    926    assert r.start <= address
    927 
    928    if address >= r.end:
    929        assert address < blockRanges[i].start
    930        clampStats.clampedNonBlockAddr()
    931        return "0"
    932 
    933    clampStats.clampedBlockAddr(r.start == address)
    934    return r.block
    935 
    936 
    937 def clampBlockList(args, inputFileName, isZipped, opener):
    938    # XXX This isn't very efficient because we end up reading and writing
    939    # the file multiple times.
    940    with opener(inputFileName, "rb") as f:
    941        j = json.load(f)
    942 
    943    if j["version"] != outputVersion:
    944        raise Exception(f"'version' property isn't '{outputVersion:d}'")
    945 
    946    # Check that the invocation is reasonable for contents clamping.
    947    invocation = j["invocation"]
    948    if invocation["mode"] != "scan":
    949        raise Exception("Log was taken in mode " + invocation["mode"] + " not scan")
    950 
    951    sys.stderr.write("Creating block range list.\n")
    952    blockList = j["blockList"]
    953    blockRanges = []
    954    for block in blockList:
    955        blockRanges.append(AddrRange(block["addr"], block["req"]))
    956    blockRanges.sort(key=lambda r: r.start)
    957 
    958    # Make sure there are no overlapping blocks.
    959    prevRange = blockRanges[0]
    960    for currRange in blockRanges[1:]:
    961        assert prevRange.end <= currRange.start
    962        prevRange = currRange
    963 
    964    sys.stderr.write("Clamping block contents.\n")
    965    clampStats = ClampStats()
    966    firstAddr = blockRanges[0].start
    967    lastAddr = blockRanges[-1].end
    968 
    969    blockStarts = []
    970    for r in blockRanges:
    971        blockStarts.append(r.start)
    972 
    973    for block in blockList:
    974        # Small blocks don't have any contents.
    975        if "contents" not in block:
    976            continue
    977 
    978        cont = block["contents"]
    979        for i in range(len(cont)):
    980            address = int(cont[i], 16)
    981 
    982            if address == 0:
    983                clampStats.nullAddr()
    984                continue
    985 
    986            # If the address is before the first block or after the last
    987            # block then it can't be within a block.
    988            if address < firstAddr or address >= lastAddr:
    989                clampStats.clampedNonBlockAddr()
    990                cont[i] = "0"
    991                continue
    992 
    993            cont[i] = clampAddress(blockRanges, blockStarts, clampStats, address)
    994 
    995        # Remove any trailing nulls.
    996        while len(cont) and cont[-1] == "0":
    997            cont.pop()
    998 
    999    if args.print_clamp_stats:
   1000        clampStats.log()
   1001 
   1002    sys.stderr.write("Saving file.\n")
   1003    tmpFile = tempfile.NamedTemporaryFile(delete=False)
   1004    tmpFilename = tmpFile.name
   1005    if isZipped:
   1006        tmpFile = gzip.GzipFile(filename="", fileobj=tmpFile, mode="wb")
   1007    prettyPrintDmdJson(io.TextIOWrapper(tmpFile, encoding="utf-8"), j)
   1008    tmpFile.close()
   1009    shutil.move(tmpFilename, inputFileName)
   1010 
   1011 
   1012 def main():
   1013    args = parseCommandLine()
   1014    digest = getDigestFromFile(args, args.input_file)
   1015    if args.input_file2:
   1016        digest2 = getDigestFromFile(args, args.input_file2)
   1017        digest = diffDigests(args, digest, digest2)
   1018    printDigest(args, digest)
   1019 
   1020 
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
   main()