tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

create_unwind_table.py (40120B)


      1 #!/usr/bin/env python3
      2 # Copyright 2021 The Chromium Authors
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 """Creates a table of unwind information in Android Chrome's bespoke format."""
      6 
      7 import abc
      8 import argparse
      9 import collections
     10 import enum
     11 import json
     12 import logging
     13 import re
     14 import struct
     15 import subprocess
     16 import sys
     17 from typing import (Dict, Iterable, List, NamedTuple, Sequence, TextIO, Tuple,
     18                    Union)
     19 
     20 from util import build_utils
     21 
     22 _STACK_CFI_INIT_REGEX = re.compile(
     23    r'^STACK CFI INIT ([0-9a-f]+) ([0-9a-f]+) (.+)$')
     24 _STACK_CFI_REGEX = re.compile(r'^STACK CFI ([0-9a-f]+) (.+)$')
     25 
     26 
     27 class AddressCfi(NamedTuple):
     28  """Record representing CFI for an address within a function.
     29 
     30  Represents the Call Frame Information required to unwind from an address in a
     31  function.
     32 
     33  Attributes:
     34      address: The address.
     35      unwind_instructions: The unwind instructions for the address.
     36 
     37  """
     38  address: int
     39  unwind_instructions: str
     40 
     41 
     42 class FunctionCfi(NamedTuple):
     43  """Record representing CFI for a function.
     44 
     45  Note: address_cfi[0].address is the start address of the function.
     46 
     47  Attributes:
     48      size: The function size in bytes.
     49      address_cfi: The CFI at each address in the function.
     50 
     51  """
     52  size: int
     53  address_cfi: Tuple[AddressCfi, ...]
     54 
     55 
     56 def FilterToNonTombstoneCfi(stream: TextIO) -> Iterable[str]:
     57  """Generates non-tombstone STACK CFI lines from the stream.
     58 
     59  STACK CFI functions with address 0 correspond are a 'tombstone' record
     60  associated with dead code and can be ignored. See
     61  https://bugs.llvm.org/show_bug.cgi?id=47148#c2.
     62 
     63  Args:
     64      stream: A file object.
     65 
     66  Returns:
     67      An iterable over the non-tombstone STACK CFI lines in the stream.
     68  """
     69  in_tombstone_function = False
     70  for line in stream:
     71    if not line.startswith('STACK CFI '):
     72      continue
     73 
     74    if line.startswith('STACK CFI INIT 0 '):
     75      in_tombstone_function = True
     76    elif line.startswith('STACK CFI INIT '):
     77      in_tombstone_function = False
     78 
     79    if not in_tombstone_function:
     80      yield line
     81 
     82 
     83 def ReadFunctionCfi(stream: TextIO) -> Iterable[FunctionCfi]:
     84  """Generates FunctionCfi records from the stream.
     85 
     86  Args:
     87      stream: A file object.
     88 
     89  Returns:
     90      An iterable over FunctionCfi corresponding to the non-tombstone STACK CFI
     91      lines in the stream.
     92  """
     93  current_function_address = None
     94  current_function_size = None
     95  current_function_address_cfi = []
     96  for line in FilterToNonTombstoneCfi(stream):
     97    cfi_init_match = _STACK_CFI_INIT_REGEX.search(line)
     98    if cfi_init_match:
     99      # Function CFI with address 0 are tombstone entries per
    100      # https://bugs.llvm.org/show_bug.cgi?id=47148#c2 and should have been
    101      # filtered in `FilterToNonTombstoneCfi`.
    102      assert current_function_address != 0
    103      if (current_function_address is not None
    104          and current_function_size is not None):
    105        yield FunctionCfi(current_function_size,
    106                          tuple(current_function_address_cfi))
    107      current_function_address = int(cfi_init_match.group(1), 16)
    108      current_function_size = int(cfi_init_match.group(2), 16)
    109      current_function_address_cfi = [
    110          AddressCfi(int(cfi_init_match.group(1), 16), cfi_init_match.group(3))
    111      ]
    112    else:
    113      cfi_match = _STACK_CFI_REGEX.search(line)
    114      assert cfi_match
    115      current_function_address_cfi.append(
    116          AddressCfi(int(cfi_match.group(1), 16), cfi_match.group(2)))
    117 
    118  assert current_function_address is not None
    119  assert current_function_size is not None
    120  yield FunctionCfi(current_function_size, tuple(current_function_address_cfi))
    121 
    122 
    123 def EncodeAsBytes(*values: int) -> bytes:
    124  """Encodes the argument ints as bytes.
    125 
    126  This function validates that the inputs are within the range that can be
    127  represented as bytes.
    128 
    129  Args:
    130    values: Integers in range [0, 255].
    131 
    132  Returns:
    133    The values encoded as bytes.
    134  """
    135  for i, value in enumerate(values):
    136    if not 0 <= value <= 255:
    137      raise ValueError('value = %d out of bounds at byte %d' % (value, i))
    138  return bytes(values)
    139 
    140 
    141 def Uleb128Encode(value: int) -> bytes:
    142  """Encodes the argument int to ULEB128 format.
    143 
    144  Args:
    145    value: Unsigned integer.
    146 
    147  Returns:
    148    The values encoded as ULEB128 bytes.
    149  """
    150  if value < 0:
    151    raise ValueError(f'Cannot uleb128 encode negative value ({value}).')
    152 
    153  uleb128_bytes = []
    154  done = False
    155  while not done:
    156    value, lowest_seven_bits = divmod(value, 0x80)
    157    done = value == 0
    158    uleb128_bytes.append(lowest_seven_bits | (0x80 if not done else 0x00))
    159  return EncodeAsBytes(*uleb128_bytes)
    160 
    161 
    162 def EncodeStackPointerUpdate(offset: int) -> bytes:
    163  """Encodes a stack pointer update as arm unwind instructions.
    164 
    165  Args:
    166    offset: Offset to apply on stack pointer. Should be in range [-0x204, inf).
    167 
    168  Returns:
    169    A list of arm unwind instructions as bytes.
    170  """
    171  assert offset % 4 == 0
    172 
    173  abs_offset = abs(offset)
    174  instruction_code = 0b01000000 if offset < 0 else 0b00000000
    175  if 0x04 <= abs_offset <= 0x200:
    176    instructions = [
    177        # vsp = vsp + (xxxxxx << 2) + 4. Covers range 0x04-0x100 inclusive.
    178        instruction_code | ((min(abs_offset, 0x100) - 4) >> 2)
    179    ]
    180    # For vsp increments of 0x104-0x200 we use 00xxxxxx twice.
    181    if abs_offset >= 0x104:
    182      instructions.append(instruction_code | ((abs_offset - 0x100 - 4) >> 2))
    183    try:
    184      return EncodeAsBytes(*instructions)
    185    except ValueError as e:
    186      raise RuntimeError('offset = %d produced out of range value' %
    187                         offset) from e
    188  else:
    189    # This only encodes positive sp movement.
    190    assert offset > 0, offset
    191    return EncodeAsBytes(0b10110010  # vsp = vsp + 0x204 + (uleb128 << 2)
    192                         ) + Uleb128Encode((offset - 0x204) >> 2)
    193 
    194 
    195 def EncodePop(registers: Sequence[int]) -> bytes:
    196  """Encodes popping of a sequence of registers as arm unwind instructions.
    197 
    198  Args:
    199    registers: Collection of target registers to accept values popped from
    200      stack. Register value order in the sequence does not matter. Values are
    201      popped based on register index order.
    202 
    203  Returns:
    204    A list of arm unwind instructions as bytes.
    205  """
    206  assert all(
    207      r in range(4, 16)
    208      for r in registers), f'Can only pop r4 ~ r15. Registers:\n{registers}.'
    209  assert len(registers) > 0, 'Register sequence cannot be empty.'
    210 
    211  instructions: List[int] = []
    212 
    213  # Check if the pushed registers are continuous set starting from r4 (and
    214  # ending prior to r12). This scenario has its own encoding.
    215  pop_lr = 14 in registers
    216  non_lr_registers = [r for r in registers if r != 14]
    217  non_lr_registers_continuous_from_r4 = \
    218    sorted(non_lr_registers) == list(range(4, 4 + len(non_lr_registers)))
    219 
    220  if (pop_lr and 0 < len(non_lr_registers) <= 8
    221      and non_lr_registers_continuous_from_r4):
    222    instructions.append(0b10101000
    223                        | (len(non_lr_registers) - 1)  # Pop r4-r[4+nnn], r14.
    224                        )
    225  else:
    226    register_bits = 0
    227    for register in registers:
    228      register_bits |= 1 << register
    229    register_bits = register_bits >> 4  # Skip r0 ~ r3.
    230    instructions.extend([
    231        # Pop up to 12 integer registers under masks {r15-r12}, {r11-r4}.
    232        0b10000000 | (register_bits >> 8),
    233        register_bits & 0xff
    234    ])
    235 
    236  return EncodeAsBytes(*instructions)
    237 
    238 
    239 class UnwindType(enum.Enum):
    240  """
    241  The type of unwind action to perform.
    242  """
    243 
    244  # Use lr as the return address.
    245  RETURN_TO_LR = 1
    246 
    247  # Increment or decrement the stack pointer and/or pop registers (r4 ~ r15).
    248  # If both, the increment/decrement occurs first.
    249  UPDATE_SP_AND_OR_POP_REGISTERS = 2
    250 
    251  # Restore the stack pointer from a register then increment/decrement the stack
    252  # pointer.
    253  RESTORE_SP_FROM_REGISTER = 3
    254 
    255  # No action necessary. Used for floating point register pops.
    256  NO_ACTION = 4
    257 
    258 
    259 class AddressUnwind(NamedTuple):
    260  """Record representing unwind information for an address within a function.
    261 
    262  Attributes:
    263      address_offset: The offset of the address from the start of the function.
    264      unwind_type: The type of unwind to perform from the address.
    265      sp_offset: The offset to apply to the stack pointer.
    266      registers: The registers involved in the unwind.
    267  """
    268  address_offset: int
    269  unwind_type: UnwindType
    270  sp_offset: int
    271  registers: Tuple[int, ...]
    272 
    273 
    274 class FunctionUnwind(NamedTuple):
    275  """Record representing unwind information for a function.
    276 
    277  Attributes:
    278      address: The address of the function.
    279      size: The function size in bytes.
    280      address_unwind_info: The unwind info at each address in the function.
    281  """
    282 
    283  address: int
    284  size: int
    285  address_unwinds: Tuple[AddressUnwind, ...]
    286 
    287 
    288 def EncodeAddressUnwind(address_unwind: AddressUnwind) -> bytes:
    289  """Encodes an `AddressUnwind` object as arm unwind instructions.
    290 
    291  Args:
    292    address_unwind: Record representing unwind information for an address within
    293      a function.
    294 
    295  Returns:
    296    A list of arm unwind instructions as bytes.
    297  """
    298  if address_unwind.unwind_type == UnwindType.RETURN_TO_LR:
    299    return EncodeAsBytes(0b10110000)  # Finish.
    300  if address_unwind.unwind_type == UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS:
    301    return ((EncodeStackPointerUpdate(address_unwind.sp_offset)
    302             if address_unwind.sp_offset else b'') +
    303            (EncodePop(address_unwind.registers)
    304             if address_unwind.registers else b''))
    305 
    306  if address_unwind.unwind_type == UnwindType.RESTORE_SP_FROM_REGISTER:
    307    assert len(address_unwind.registers) == 1
    308    return (EncodeAsBytes(0b10010000
    309                          | address_unwind.registers[0]  # Set vsp = r[nnnn].
    310                          ) +
    311            (EncodeStackPointerUpdate(address_unwind.sp_offset)
    312             if address_unwind.sp_offset else b''))
    313 
    314  if address_unwind.unwind_type == UnwindType.NO_ACTION:
    315    return b''
    316 
    317  assert False, 'unknown unwind type'
    318  return b''
    319 
    320 
    321 class UnwindInstructionsParser(abc.ABC):
    322  """Base class for parsers of breakpad unwind instruction sequences.
    323 
    324  Provides regexes matching breakpad instruction sequences understood by the
    325  parser, and parsing of the sequences from the regex match.
    326  """
    327 
    328  @abc.abstractmethod
    329  def GetBreakpadInstructionsRegex(self) -> re.Pattern:
    330    pass
    331 
    332  @abc.abstractmethod
    333  def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
    334                     match: re.Match) -> Tuple[AddressUnwind, int]:
    335    """Returns the regex matching the breakpad instructions.
    336 
    337    Args:
    338      address_offset: Offset from function start address.
    339      cfa_sp_offset: CFA stack pointer offset.
    340 
    341    Returns:
    342      The unwind info for the address plus the new cfa_sp_offset.
    343    """
    344 
    345 
    346 class NullParser(UnwindInstructionsParser):
    347  """Translates the state before any instruction has been executed."""
    348 
    349  regex = re.compile(r'^\.cfa: sp 0 \+ \.ra: lr$')
    350 
    351  def GetBreakpadInstructionsRegex(self) -> re.Pattern:
    352    return self.regex
    353 
    354  def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
    355                     match: re.Match) -> Tuple[AddressUnwind, int]:
    356    return AddressUnwind(address_offset, UnwindType.RETURN_TO_LR, 0, ()), 0
    357 
    358 
    359 class PushOrSubSpParser(UnwindInstructionsParser):
    360  """Translates unwinds from push or sub sp, #constant instructions."""
    361 
    362  # We expect at least one of the three outer groups to be non-empty. Cases:
    363  #
    364  # Standard prologue pushes.
    365  #   Match the first two and optionally the third.
    366  #
    367  # Standard prologue sub sp, #constant.
    368  #   Match only the first.
    369  #
    370  # Pushes in dynamic stack allocation functions after saving sp.
    371  #   Match only the third since they don't alter the stack pointer or store the
    372  #   return address.
    373  #
    374  # Leaf functions that use callee-save registers.
    375  #   Match the first and third but not the second.
    376  regex = re.compile(r'^(?:\.cfa: sp (\d+) \+ ?)?'
    377                     r'(?:\.ra: \.cfa (-\d+) \+ \^ ?)?'
    378                     r'((?:r\d+: \.cfa -\d+ \+ \^ ?)*)$')
    379 
    380  # 'r' followed by digits, with 'r' matched via positive lookbehind so only the
    381  # number appears in the match.
    382  register_regex = re.compile('(?<=r)(\d+)')
    383 
    384  def GetBreakpadInstructionsRegex(self) -> re.Pattern:
    385    return self.regex
    386 
    387  def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
    388                     match: re.Match) -> Tuple[AddressUnwind, int]:
    389    # The group will be None if the outer non-capturing groups for the(\d+) and
    390    # (-\d+) expressions are not matched.
    391    new_cfa_sp_offset, ra_cfa_offset = (int(group) if group else None
    392                                        for group in match.groups()[:2])
    393 
    394    # Registers are pushed in reverse order by register number so are popped in
    395    # order. Sort them to ensure the proper order.
    396    registers = sorted([
    397        int(register)
    398        for register in self.register_regex.findall(match.group(3))
    399        # `UpdateSpAndOrPopRegisters` only supports popping of register
    400        # r4 ~ r15. The ignored registers are translated to sp increments by
    401        # the following calculation on `sp_offset`.
    402        if int(register) in range(4, 16)
    403    ] +
    404                       # Also pop lr (ra in breakpad terms) if it was stored.
    405                       ([14] if ra_cfa_offset is not None else []))
    406 
    407    sp_offset = 0
    408    if new_cfa_sp_offset is not None:
    409      sp_offset = new_cfa_sp_offset - cfa_sp_offset
    410      assert sp_offset % 4 == 0
    411      if sp_offset >= len(registers) * 4:
    412        # Handles the sub sp, #constant case, and push instructions that push
    413        # caller-save registers r0-r3 which don't get encoded in the unwind
    414        # instructions. In the latter case we need to move the stack pointer up
    415        # to the first pushed register.
    416        sp_offset -= len(registers) * 4
    417 
    418    return AddressUnwind(address_offset,
    419                         UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS, sp_offset,
    420                         tuple(registers)), new_cfa_sp_offset or cfa_sp_offset
    421 
    422 
    423 class VPushParser(UnwindInstructionsParser):
    424  # VPushes that occur in dynamic stack allocation functions after storing the
    425  # stack pointer don't change the stack pointer or push any register that we
    426  # care about. The first group will not match in those cases.
    427  #
    428  # Breakpad doesn't seem to understand how to name the floating point
    429  # registers so calls them unnamed_register.
    430  regex = re.compile(r'^(?:\.cfa: sp (\d+) \+ )?'
    431                     r'(?:unnamed_register\d+: \.cfa -\d+ \+ \^ ?)+$')
    432 
    433  def GetBreakpadInstructionsRegex(self) -> re.Pattern:
    434    return self.regex
    435 
    436  def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
    437                     match: re.Match) -> Tuple[AddressUnwind, int]:
    438    # `match.group(1)`, which corresponds to the (\d+) expression, will be None
    439    # if the first outer non-capturing group is not matched.
    440    new_cfa_sp_offset = int(match.group(1)) if match.group(1) else None
    441    if new_cfa_sp_offset is None:
    442      return (AddressUnwind(address_offset, UnwindType.NO_ACTION, 0,
    443                            ()), cfa_sp_offset)
    444 
    445    sp_offset = new_cfa_sp_offset - cfa_sp_offset
    446    assert sp_offset % 4 == 0
    447    return AddressUnwind(address_offset,
    448                         UnwindType.UPDATE_SP_AND_OR_POP_REGISTERS, sp_offset,
    449                         ()), new_cfa_sp_offset
    450 
    451 
    452 class StoreSpParser(UnwindInstructionsParser):
    453  regex = re.compile(r'^\.cfa: r(\d+) (\d+) \+$')
    454 
    455  def GetBreakpadInstructionsRegex(self) -> re.Pattern:
    456    return self.regex
    457 
    458  def ParseFromMatch(self, address_offset: int, cfa_sp_offset: int,
    459                     match: re.Match) -> Tuple[AddressUnwind, int]:
    460    register = int(match.group(1))
    461    new_cfa_sp_offset = int(match.group(2))
    462    sp_offset = new_cfa_sp_offset - cfa_sp_offset
    463    assert sp_offset % 4 == 0
    464    return AddressUnwind(address_offset, UnwindType.RESTORE_SP_FROM_REGISTER,
    465                         sp_offset, (register, )), new_cfa_sp_offset
    466 
    467 
    468 def EncodeUnwindInstructionTable(complete_instruction_sequences: Iterable[bytes]
    469                                 ) -> Tuple[bytes, Dict[bytes, int]]:
    470  """Encodes the unwind instruction table.
    471 
    472  Deduplicates the encoded unwind instruction sequences. Generates the table and
    473  a dictionary mapping a function to its starting index in the table.
    474 
    475  The instruction table is used by the unwinder to provide the sequence of
    476  unwind instructions to execute for each function, separated by offset
    477  into the function.
    478 
    479  Args:
    480    complete_instruction_sequences: An iterable of encoded unwind instruction
    481      sequences. The sequences represent the series of unwind instructions to
    482      execute corresponding to offsets within each function.
    483 
    484  Returns:
    485    A tuple containing:
    486    - The unwind instruction table as bytes.
    487    - The mapping from the instruction sequence to the offset in the unwind
    488      instruction table. This mapping is used to construct the function offset
    489      table, which references entries in the unwind instruction table.
    490  """
    491  # As the function offset table uses variable length number encoding (uleb128),
    492  # which means smaller number uses fewer bytes to represent, we should sort
    493  # the unwind instruction table by number of references from the function
    494  # offset table in order to minimize the size of the function offset table.
    495  ref_counts: Dict[bytes, int] = collections.defaultdict(int)
    496  for sequence in complete_instruction_sequences:
    497    ref_counts[sequence] += 1
    498 
    499  def ComputeScore(sequence):
    500    """ Score for each sequence is computed as  ref_count / size_of_sequence.
    501 
    502    According to greedy algorithm, items with higher value / space cost ratio
    503    should be prioritized. Here value is bytes saved in the function offset
    504    table, represetned by ref_count. Space cost is the space taken in the
    505    unwind instruction table, represented by size_of_sequence.
    506 
    507    Note: In order to ensure build-time determinism, `sequence` is also returned
    508    to resolve sorting order when scores are the same.
    509    """
    510    return ref_counts[sequence] / len(sequence), sequence
    511 
    512  ordered_sequences = sorted(ref_counts.keys(), key=ComputeScore, reverse=True)
    513  offsets: Dict[bytes, int] = {}
    514  current_offset = 0
    515  for sequence in ordered_sequences:
    516    offsets[sequence] = current_offset
    517    current_offset += len(sequence)
    518  return b''.join(ordered_sequences), offsets
    519 
    520 
    521 class EncodedAddressUnwind(NamedTuple):
    522  """Record representing unwind information for an address within a function.
    523 
    524  This structure represents the same concept as `AddressUnwind`. The only
    525  difference is that how to unwind from the address is represented as
    526  encoded ARM unwind instructions.
    527 
    528  Attributes:
    529    address_offset: The offset of the address from the start address of the
    530      function.
    531    complete_instruction_sequence: The full ARM unwind instruction sequence to
    532      unwind from the `address_offset`.
    533  """
    534  address_offset: int
    535  complete_instruction_sequence: bytes
    536 
    537 
    538 def EncodeAddressUnwinds(address_unwinds: Tuple[AddressUnwind, ...]
    539                         ) -> Tuple[EncodedAddressUnwind, ...]:
    540  """Encodes the unwind instructions and offset for the addresses within a
    541  function.
    542 
    543  Args:
    544    address_unwinds: A tuple of unwind state for addresses within a function.
    545 
    546  Returns:
    547    The encoded unwind instructions and offsets for the addresses within a
    548    function, ordered by decreasing offset.
    549  """
    550  sorted_address_unwinds: List[AddressUnwind] = sorted(
    551      address_unwinds,
    552      key=lambda address_unwind: address_unwind.address_offset,
    553      reverse=True)
    554  unwind_instructions: List[bytes] = [
    555      EncodeAddressUnwind(address_unwind)
    556      for address_unwind in sorted_address_unwinds
    557  ]
    558 
    559  # A complete instruction sequence contains all the unwind instructions
    560  # necessary to unwind from an offset within a function. For a given offset
    561  # this includes the offset's instructions plus the instructions for all
    562  # earlier offsets. The offsets are stored in reverse order, hence the i:
    563  # range rather than :i+1.
    564  complete_instruction_sequences = [
    565      b''.join(unwind_instructions[i:]) for i in range(len(unwind_instructions))
    566  ]
    567 
    568  encoded_unwinds: List[EncodedAddressUnwind] = []
    569  for address_unwind, sequence in zip(sorted_address_unwinds,
    570                                      complete_instruction_sequences):
    571    encoded_unwinds.append(
    572        EncodedAddressUnwind(address_unwind.address_offset, sequence))
    573  return tuple(encoded_unwinds)
    574 
    575 
    576 class EncodedFunctionUnwind(NamedTuple):
    577  """Record representing unwind information for a function.
    578 
    579  This structure represents the same concept as `FunctionUnwind`, but with
    580  some differences:
    581  - Attribute `address` is split into 2 attributes: `page_number` and
    582    `page_offset`.
    583  - Attribute `size` is dropped.
    584  - Attribute `address_unwinds` becomes a collection of `EncodedAddressUnwind`s,
    585    instead of a collection of `AddressUnwind`s.
    586 
    587  Attributes:
    588    page_number: The upper bits (17 ~ 31bits) of byte offset from text section
    589      start.
    590    page_offset: The lower bits (1 ~ 16bits) of instruction offset from text
    591      section start.
    592    address_unwinds: A collection of `EncodedAddressUnwind`s.
    593 
    594  """
    595 
    596  page_number: int
    597  page_offset: int
    598  address_unwinds: Tuple[EncodedAddressUnwind, ...]
    599 
    600 
    601 # The trivial unwind is defined as a single `RETURN_TO_LR` instruction
    602 # at the start of the function.
    603 TRIVIAL_UNWIND: Tuple[EncodedAddressUnwind, ...] = EncodeAddressUnwinds(
    604    (AddressUnwind(address_offset=0,
    605                   unwind_type=UnwindType.RETURN_TO_LR,
    606                   sp_offset=0,
    607                   registers=()), ))
    608 
    609 # The refuse to unwind filler unwind is used to fill the invalid space
    610 # before the first function in the first page and after the last function
    611 # in the last page.
    612 REFUSE_TO_UNWIND: Tuple[EncodedAddressUnwind, ...] = (EncodedAddressUnwind(
    613    address_offset=0,
    614    complete_instruction_sequence=bytes([0b10000000, 0b00000000])), )
    615 
    616 
    617 def EncodeFunctionUnwinds(function_unwinds: Iterable[FunctionUnwind],
    618                          text_section_start_address: int
    619                          ) -> Iterable[EncodedFunctionUnwind]:
    620  """Encodes the unwind state for all functions defined in the binary.
    621 
    622  This function
    623  - sorts the collection of `FunctionUnwind`s by address.
    624  - fills in gaps between functions with trivial unwind.
    625  - fills the space in the last page after last function with refuse to unwind.
    626  - fills the space in the first page before the first function with refuse
    627    to unwind.
    628 
    629  Args:
    630    function_unwinds: An iterable of function unwind states.
    631    text_section_start_address: The address of .text section in ELF file.
    632 
    633  Returns:
    634    The encoded function unwind states with no gaps between functions, ordered
    635    by ascending address.
    636  """
    637 
    638  def GetPageNumber(address: int) -> int:
    639    """Calculates the page number.
    640 
    641    Page number is calculated as byte_offset_from_text_section_start >> 17,
    642    i.e. the upper bits (17 ~ 31bits) of byte offset from text section start.
    643    """
    644    return (address - text_section_start_address) >> 17
    645 
    646  def GetPageOffset(address: int) -> int:
    647    """Calculates the page offset.
    648 
    649    Page offset is calculated as (byte_offset_from_text_section_start >> 1)
    650    & 0xffff, i.e. the lower bits (1 ~ 16bits) of instruction offset from
    651    text section start.
    652    """
    653    return ((address - text_section_start_address) >> 1) & 0xffff
    654 
    655  sorted_function_unwinds: List[FunctionUnwind] = sorted(
    656      function_unwinds, key=lambda function_unwind: function_unwind.address)
    657 
    658  if sorted_function_unwinds[0].address > text_section_start_address:
    659    yield EncodedFunctionUnwind(page_number=0,
    660                                page_offset=0,
    661                                address_unwinds=REFUSE_TO_UNWIND)
    662 
    663  prev_func_end_address: int = sorted_function_unwinds[0].address
    664 
    665  gaps = 0
    666  for unwind in sorted_function_unwinds:
    667    assert prev_func_end_address <= unwind.address, (
    668        'Detected overlap between functions.')
    669 
    670    if prev_func_end_address < unwind.address:
    671      # Gaps between functions are typically filled by regions of thunks which
    672      # do not alter the stack pointer. Filling these gaps with TRIVIAL_UNWIND
    673      # is the appropriate unwind strategy.
    674      gaps += 1
    675      yield EncodedFunctionUnwind(GetPageNumber(prev_func_end_address),
    676                                  GetPageOffset(prev_func_end_address),
    677                                  TRIVIAL_UNWIND)
    678 
    679    yield EncodedFunctionUnwind(GetPageNumber(unwind.address),
    680                                GetPageOffset(unwind.address),
    681                                EncodeAddressUnwinds(unwind.address_unwinds))
    682 
    683    prev_func_end_address = unwind.address + unwind.size
    684 
    685  if GetPageOffset(prev_func_end_address) != 0:
    686    yield EncodedFunctionUnwind(GetPageNumber(prev_func_end_address),
    687                                GetPageOffset(prev_func_end_address),
    688                                REFUSE_TO_UNWIND)
    689 
    690  logging.info('%d/%d gaps between functions filled with trivial unwind.', gaps,
    691               len(sorted_function_unwinds))
    692 
    693 
    694 def EncodeFunctionOffsetTable(
    695    encoded_address_unwind_sequences: Iterable[
    696        Tuple[EncodedAddressUnwind, ...]],
    697    unwind_instruction_table_offsets: Dict[bytes, int]
    698 ) -> Tuple[bytes, Dict[Tuple[EncodedAddressUnwind, ...], int]]:
    699  """Encodes the function offset table.
    700 
    701  The function offset table maps local instruction offset from function
    702  start to the location in the unwind instruction table.
    703 
    704  Args:
    705    encoded_address_unwind_sequences: An iterable of encoded address unwind
    706      sequences.
    707    unwind_instruction_table_offsets: The offset mapping returned from
    708      `EncodeUnwindInstructionTable`.
    709 
    710  Returns:
    711    A tuple containing:
    712    - The function offset table as bytes.
    713    - The mapping from the `EncodedAddressUnwind`s to the offset in the function
    714      offset table. This mapping is used to construct the function table, which
    715      references entries in the function offset table.
    716  """
    717  function_offset_table = bytearray()
    718  offsets: Dict[Tuple[EncodedAddressUnwind, ...], int] = {}
    719 
    720  for sequence in encoded_address_unwind_sequences:
    721    if sequence in offsets:
    722      continue
    723 
    724    offsets[sequence] = len(function_offset_table)
    725    for address_offset, complete_instruction_sequence in sequence:
    726      # Note: address_offset is the number of bytes from one address to another,
    727      # while the instruction_offset is the number of 2-byte instructions
    728      # from one address to another.
    729      instruction_offset = address_offset >> 1
    730      function_offset_table += (
    731          Uleb128Encode(instruction_offset) + Uleb128Encode(
    732              unwind_instruction_table_offsets[complete_instruction_sequence]))
    733 
    734  return bytes(function_offset_table), offsets
    735 
    736 
    737 def EncodePageTableAndFunctionTable(
    738    function_unwinds: Iterable[EncodedFunctionUnwind],
    739    function_offset_table_offsets: Dict[Tuple[EncodedAddressUnwind, ...], int]
    740 ) -> Tuple[bytes, bytes]:
    741  """Encode page table and function table as bytes.
    742 
    743  Page table:
    744  A table that contains the mapping from page_number to the location of the
    745  entry for the first function on the page in the function table.
    746 
    747  Function table:
    748  A table that contains the mapping from page_offset to the location of an entry
    749  in the function offset table.
    750 
    751  Args:
    752    function_unwinds: All encoded function unwinds in the module.
    753    function_offset_table_offsets: The offset mapping returned from
    754      `EncodeFunctionOffsetTable`.
    755 
    756  Returns:
    757    A tuple containing:
    758    - The page table as bytes.
    759    - The function table as bytes.
    760  """
    761  page_function_unwinds: Dict[
    762      int, List[EncodedFunctionUnwind]] = collections.defaultdict(list)
    763  for function_unwind in function_unwinds:
    764    page_function_unwinds[function_unwind.page_number].append(function_unwind)
    765 
    766  raw_page_table: List[int] = []
    767  function_table = bytearray()
    768 
    769  for page_number, same_page_function_unwinds in sorted(
    770      page_function_unwinds.items(), key=lambda item: item[0]):
    771    # Pad empty pages.
    772    # Empty pages can occur when a function spans over multiple pages.
    773    # Example:
    774    # A page table with a starting function that spans 3 over pages.
    775    # page_table:
    776    # [0, 1, 1, 1]
    777    # function_table:
    778    # [
    779    #   # Page 0
    780    #   (0, 20) # This function spans from page 0 offset 0 to page 3 offset 5.
    781    #   # Page 1 is empty.
    782    #   # Page 2 is empty.
    783    #   # Page 3
    784    #   (6, 70)
    785    # ]
    786    assert page_number > len(raw_page_table) - 1
    787    number_of_empty_pages = page_number - len(raw_page_table)
    788    # The function table is represented as `base::FunctionTableEntry[]`,
    789    # where `base::FunctionTableEntry` is 4 bytes.
    790    function_table_index = len(function_table) // 4
    791    raw_page_table.extend([function_table_index] * (number_of_empty_pages + 1))
    792    assert page_number == len(raw_page_table) - 1
    793 
    794    for function_unwind in sorted(
    795        same_page_function_unwinds,
    796        key=lambda function_unwind: function_unwind.page_offset):
    797      function_table += struct.pack(
    798          'HH', function_unwind.page_offset,
    799          function_offset_table_offsets[function_unwind.address_unwinds])
    800 
    801  page_table = struct.pack(f'{len(raw_page_table)}I', *raw_page_table)
    802 
    803  return page_table, bytes(function_table)
    804 
    805 
    806 ALL_PARSERS: Tuple[UnwindInstructionsParser, ...] = (
    807    NullParser(),
    808    PushOrSubSpParser(),
    809    StoreSpParser(),
    810    VPushParser(),
    811 )
    812 
    813 
    814 def ParseAddressCfi(address_cfi: AddressCfi, function_start_address: int,
    815                    parsers: Tuple[UnwindInstructionsParser, ...],
    816                    prev_cfa_sp_offset: int
    817                    ) -> Tuple[Union[AddressUnwind, None], bool, int]:
    818  """Parses address CFI with given parsers.
    819 
    820  Args:
    821    address_cfi: The CFI for an address in the function.
    822    function_start_address: The start address of the function.
    823    parsers: Available parsers to try on CFI data.
    824    prev_cfa_sp_offset: Previous CFA stack pointer offset.
    825 
    826  Returns:
    827    A tuple containing:
    828    - An `AddressUnwind` object when the parse is successful, None otherwise.
    829    - Whether the address is in function epilogue.
    830    - The new cfa_sp_offset.
    831  """
    832  for parser in parsers:
    833    match = parser.GetBreakpadInstructionsRegex().search(
    834        address_cfi.unwind_instructions)
    835    if not match:
    836      continue
    837 
    838    address_unwind, cfa_sp_offset = parser.ParseFromMatch(
    839        address_cfi.address - function_start_address, prev_cfa_sp_offset, match)
    840 
    841    in_epilogue = (
    842        prev_cfa_sp_offset > cfa_sp_offset
    843        and address_unwind.unwind_type != UnwindType.RESTORE_SP_FROM_REGISTER)
    844 
    845    return (address_unwind if not in_epilogue else None, in_epilogue,
    846            cfa_sp_offset)
    847 
    848  return None, False, prev_cfa_sp_offset
    849 
    850 
    851 def GenerateUnwinds(function_cfis: Iterable[FunctionCfi],
    852                    parsers: Tuple[UnwindInstructionsParser, ...]
    853                    ) -> Iterable[FunctionUnwind]:
    854  """Generates parsed function unwind states from breakpad CFI data.
    855 
    856  This function parses `FunctionCfi`s to `FunctionUnwind`s using
    857  `UnwindInstructionParser`.
    858 
    859  Args:
    860    function_cfis: An iterable of function CFI data.
    861    parsers: Available parsers to try on CFI address data.
    862 
    863  Returns:
    864    An iterable of parsed function unwind states.
    865  """
    866  functions = 0
    867  addresses = 0
    868  handled_addresses = 0
    869  epilogues_seen = 0
    870 
    871  for function_cfi in function_cfis:
    872    functions += 1
    873    address_unwinds: List[AddressUnwind] = []
    874    cfa_sp_offset = 0
    875    for address_cfi in function_cfi.address_cfi:
    876      addresses += 1
    877 
    878      address_unwind, in_epilogue, cfa_sp_offset = ParseAddressCfi(
    879          address_cfi, function_cfi.address_cfi[0].address, parsers,
    880          cfa_sp_offset)
    881 
    882      if address_unwind:
    883        handled_addresses += 1
    884        address_unwinds.append(address_unwind)
    885        continue
    886 
    887      if in_epilogue:
    888        epilogues_seen += 1
    889        break
    890 
    891      logging.info('unrecognized CFI: %x %s.', address_cfi.address,
    892                   address_cfi.unwind_instructions)
    893 
    894    if address_unwinds:
    895      # We expect that the unwind information for every function starts with a
    896      # trivial unwind (RETURN_TO_LR) prior to the execution of any code in the
    897      # function. This is required by the arm calling convention which involves
    898      # setting lr to the return address on calling into a function.
    899      assert address_unwinds[0].address_offset == 0
    900      assert address_unwinds[0].unwind_type == UnwindType.RETURN_TO_LR
    901 
    902      yield FunctionUnwind(function_cfi.address_cfi[0].address,
    903                           function_cfi.size, tuple(address_unwinds))
    904 
    905  logging.info('%d functions.', functions)
    906  logging.info('%d/%d addresses handled.', handled_addresses, addresses)
    907  logging.info('epilogues_seen: %d.', epilogues_seen)
    908 
    909 
    910 def EncodeUnwindInfo(page_table: bytes, function_table: bytes,
    911                     function_offset_table: bytes,
    912                     unwind_instruction_table: bytes) -> bytes:
    913  """Encodes all unwind tables as a single binary.
    914 
    915  Concats all unwind table binaries together and attach a header at the start
    916  with a offset-size pair for each table.
    917 
    918  offset: The offset to the target table from the start of the unwind info
    919    binary in bytes.
    920  size: The declared size of the target table.
    921 
    922  Both offset and size are represented as 32bit integers.
    923  See `base::ChromeUnwindInfoHeaderAndroid` for more details.
    924 
    925  Args:
    926    page_table: The page table as bytes.
    927    function_table: The function table as bytes.
    928    function_offset_table: The function offset table as bytes.
    929    unwind_instruction_table: The unwind instruction table as bytes.
    930 
    931  Returns:
    932    A single binary containing
    933    - A header that points to the location of each table.
    934    - All unwind tables.
    935  """
    936  unwind_info_header = bytearray()
    937  # Each table is represented as (offset, size) pair, both offset and size
    938  # are represented as 4 byte integer.
    939  unwind_info_header_size = 4 * 2 * 4
    940  unwind_info_body = bytearray()
    941 
    942  # Both the page_table and the function table need to be aligned because their
    943  # contents are interpreted as multi-byte integers. However, the byte size of
    944  # the header, the page table, the function table are all multiples of 4 and
    945  # the resource will be memory mapped at a 4 byte boundary, so no extra care
    946  # is required to align the page table and the function table.
    947  #
    948  # The function offset table and the unwind instruction table are accessed
    949  # byte by byte, so they only need 1 byte alignment.
    950 
    951  assert len(page_table) % 4 == 0, (
    952      'Each entry in the page table should be 4-byte integer.')
    953  assert len(function_table) % 4 == 0, (
    954      'Each entry in the function table should be a pair of 2 2-byte integers.')
    955 
    956  for table in page_table, function_table:
    957    offset = unwind_info_header_size + len(unwind_info_body)
    958    # For the page table and the function_table, declared size is the number of
    959    # entries in each table. The tables will be aligned to a 4 byte boundary
    960    # because the resource will be memory mapped at a 4 byte boundary and the
    961    # header is a multiple of 4 bytes.
    962    declared_size = len(table) // 4
    963    unwind_info_header += struct.pack('II', offset, declared_size)
    964    unwind_info_body += table
    965 
    966  for table in function_offset_table, unwind_instruction_table:
    967    offset = unwind_info_header_size + len(unwind_info_body)
    968    # Because both the function offset table and the unwind instruction table
    969    # contain variable length encoded numbers, the declared size is simply the
    970    # number of bytes in each table. The tables only require 1 byte alignment.
    971    declared_size = len(table)
    972    unwind_info_header += struct.pack('II', offset, declared_size)
    973    unwind_info_body += table
    974 
    975  return bytes(unwind_info_header + unwind_info_body)
    976 
    977 
    978 def GenerateUnwindTables(
    979    encoded_function_unwinds_iterable: Iterable[EncodedFunctionUnwind]
    980 ) -> Tuple[bytes, bytes, bytes, bytes]:
    981  """Generates all unwind tables as bytes.
    982 
    983  Args:
    984    encoded_function_unwinds_iterable: Encoded function unwinds for all
    985      functions in the ELF binary.
    986 
    987  Returns:
    988    A tuple containing:
    989    - The page table as bytes.
    990    - The function table as bytes.
    991    - The function offset table as bytes.
    992    - The unwind instruction table as bytes.
    993  """
    994  encoded_function_unwinds: List[EncodedFunctionUnwind] = list(
    995      encoded_function_unwinds_iterable)
    996  complete_instruction_sequences: List[bytes] = []
    997  encoded_address_unwind_sequences: List[Tuple[EncodedAddressUnwind, ...]] = []
    998 
    999  for encoded_function_unwind in encoded_function_unwinds:
   1000    encoded_address_unwind_sequences.append(
   1001        encoded_function_unwind.address_unwinds)
   1002    for address_unwind in encoded_function_unwind.address_unwinds:
   1003      complete_instruction_sequences.append(
   1004          address_unwind.complete_instruction_sequence)
   1005 
   1006  unwind_instruction_table, unwind_instruction_table_offsets = (
   1007      EncodeUnwindInstructionTable(complete_instruction_sequences))
   1008 
   1009  function_offset_table, function_offset_table_offsets = (
   1010      EncodeFunctionOffsetTable(encoded_address_unwind_sequences,
   1011                                unwind_instruction_table_offsets))
   1012 
   1013  page_table, function_table = EncodePageTableAndFunctionTable(
   1014      encoded_function_unwinds, function_offset_table_offsets)
   1015 
   1016  return (page_table, function_table, function_offset_table,
   1017          unwind_instruction_table)
   1018 
   1019 
   1020 def ReadTextSectionStartAddress(readobj_path: str, libchrome_path: str) -> int:
   1021  """Reads the .text section start address of libchrome ELF.
   1022 
   1023  Arguments:
   1024    readobj_path: Path to llvm-obj binary.
   1025    libchrome_path: Path to libchrome binary.
   1026 
   1027  Returns:
   1028    The text section start address as a number.
   1029  """
   1030  def GetSectionName(section) -> str:
   1031    # See crbug.com/1426287 for context on different JSON names.
   1032    if 'Name' in section['Section']['Name']:
   1033      return section['Section']['Name']['Name']
   1034    return section['Section']['Name']['Value']
   1035 
   1036  proc = subprocess.Popen(
   1037      [readobj_path, '--sections', '--elf-output-style=JSON', libchrome_path],
   1038      stdout=subprocess.PIPE,
   1039      encoding='ascii')
   1040 
   1041  elfs = json.loads(proc.stdout.read())[0]
   1042  sections = elfs['Sections']
   1043 
   1044  return next(s['Section']['Address'] for s in sections
   1045              if GetSectionName(s) == '.text')
   1046 
   1047 
   1048 def main():
   1049  build_utils.InitLogging('CREATE_UNWIND_TABLE_DEBUG')
   1050  parser = argparse.ArgumentParser(description=__doc__)
   1051  parser.add_argument('--input_path',
   1052                      help='Path to the unstripped binary.',
   1053                      required=True,
   1054                      metavar='FILE')
   1055  parser.add_argument('--output_path',
   1056                      help='Path to unwind info binary output.',
   1057                      required=True,
   1058                      metavar='FILE')
   1059  parser.add_argument('--dump_syms_path',
   1060                      required=True,
   1061                      help='The path of the dump_syms binary.',
   1062                      metavar='FILE')
   1063  parser.add_argument('--readobj_path',
   1064                      required=True,
   1065                      help='The path of the llvm-readobj binary.',
   1066                      metavar='FILE')
   1067 
   1068  args = parser.parse_args()
   1069  proc = subprocess.Popen(['./' + args.dump_syms_path, args.input_path, '-v'],
   1070                          stdout=subprocess.PIPE,
   1071                          encoding='ascii')
   1072 
   1073  function_cfis = ReadFunctionCfi(proc.stdout)
   1074  function_unwinds = GenerateUnwinds(function_cfis, parsers=ALL_PARSERS)
   1075  encoded_function_unwinds = EncodeFunctionUnwinds(
   1076      function_unwinds,
   1077      ReadTextSectionStartAddress(args.readobj_path, args.input_path))
   1078  (page_table, function_table, function_offset_table,
   1079   unwind_instruction_table) = GenerateUnwindTables(encoded_function_unwinds)
   1080  unwind_info: bytes = EncodeUnwindInfo(page_table, function_table,
   1081                                        function_offset_table,
   1082                                        unwind_instruction_table)
   1083 
   1084  if proc.wait():
   1085    logging.critical('dump_syms exited with return code %d', proc.returncode)
   1086    sys.exit(proc.returncode)
   1087 
   1088  with open(args.output_path, 'wb') as f:
   1089    f.write(unwind_info)
   1090 
   1091  return 0
   1092 
   1093 
   1094 if __name__ == '__main__':
   1095  sys.exit(main())