tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

extract_unwind_tables.py (10376B)


      1 #!/usr/bin/env python3
      2 # Copyright 2018 The Chromium Authors
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Extracts the unwind tables in from breakpad symbol files
      7 
      8 Runs dump_syms on the given binary file and extracts the CFI data into the
      9 given output file.
     10 The output file is a binary file containing CFI rows ordered based on function
     11 address. The output file only contains rows that match the most popular rule
     12 type in CFI table, to reduce the output size and specify data in compact format.
     13 See doc https://github.com/google/breakpad/blob/main/docs/symbol_files.md.
     14 1. The CFA rules should be of postfix form "SP <val> +".
     15 2. The RA rules should be of postfix form "CFA <val> + ^".
     16 Note: breakpad represents dereferencing address with '^' operator.
     17 
     18 The output file has 2 tables UNW_INDEX and UNW_DATA, inspired from ARM EHABI
     19 format. The first table contains function addresses and an index into the
     20 UNW_DATA table. The second table contains one or more rows for the function
     21 unwind information.
     22 
     23 The output file starts with 4 bytes counting the number of entries in UNW_INDEX.
     24 Then UNW_INDEX table and UNW_DATA table.
     25 
     26 UNW_INDEX contains two columns of N rows each, where N is the number of
     27 functions.
     28  1. First column 4 byte rows of all the function start address as offset from
     29     start of the binary, in sorted order.
     30  2. For each function addr, the second column contains 2 byte indices in order.
     31     The indices are offsets (in count of 2 bytes) of the CFI data from start of
     32     UNW_DATA.
     33 The last entry in the table always contains CANT_UNWIND index to specify the
     34 end address of the last function.
     35 
     36 UNW_DATA contains data of all the functions. Each function data contains N rows.
     37 The data found at the address pointed from UNW_INDEX will be:
     38  2 bytes: N - number of rows that belong to current function.
     39  N * 4 bytes: N rows of data. 16 bits : Address offset from function start.
     40                               14 bits : CFA offset / 4.
     41                                2 bits : RA offset / 4.
     42 
     43 The function is not added to the unwind table in following conditions:
     44 C1. If length of the function code (number of instructions) is greater than
     45    0xFFFF (2 byte address span). This is because we use 16 bits to refer to
     46    offset of instruction from start of the address.
     47 C2. If the function moves the SP by more than 0xFFFF bytes. This is because we
     48    use 14 bits to denote CFA offset (last 2 bits are 0).
     49 C3. If the Return Address is stored at an offset >= 16 from the CFA. Some
     50    functions which have variable arguments can have offset upto 16.
     51    TODO(ssid): We can actually store offset 16 by subtracting 1 from RA/4 since
     52    we never have 0.
     53 C4: Some functions do not have unwind information defined in dwarf info. These
     54    functions have index value CANT_UNWIND(0xFFFF) in UNW_INDEX table.
     55 
     56 
     57 Usage:
     58  extract_unwind_tables.py --input_path [root path to unstripped chrome.so]
     59      --output_path [output path] --dump_syms_path [path to dump_syms binary]
     60 """
     61 
     62 import argparse
     63 import re
     64 import struct
     65 import subprocess
     66 import sys
     67 import tempfile
     68 
     69 
     70 _CFA_REG = '.cfa'
     71 _RA_REG = '.ra'
     72 
     73 _ADDR_ENTRY = 0
     74 _LENGTH_ENTRY = 1
     75 
     76 _CANT_UNWIND = 0xFFFF
     77 
     78 
     79 def _Write4Bytes(output_file, val):
     80  """Writes a 32 bit unsigned integer to the given output file."""
     81  output_file.write(struct.pack('<L', val));
     82 
     83 
     84 def _Write2Bytes(output_file, val):
     85  """Writes a 16 bit unsigned integer to the given output file."""
     86  output_file.write(struct.pack('<H', val));
     87 
     88 
     89 def _FindRuleForRegister(cfi_row, reg):
     90  """Returns the postfix expression as string for a given register.
     91 
     92  Breakpad CFI row format specifies rules for unwinding each register in postfix
     93  expression form separated by space. Each rule starts with register name and a
     94  colon. Eg: "CFI R1: <rule> R2: <rule>".
     95  """
     96  out = []
     97  found_register = False
     98  for part in cfi_row:
     99    if found_register:
    100      if part[-1] == ':':
    101        break
    102      out.append(part)
    103    elif part == reg + ':':
    104      found_register = True
    105  return ' '.join(out)
    106 
    107 
    108 def _GetCfaAndRaOffset(cfi_row):
    109  """Returns a tuple with 2 numbers (cfa_offset, ra_offset).
    110 
    111  Returns right values if rule matches the predefined criteria. Returns (0, 0)
    112  otherwise. The criteria for CFA rule is postfix form "SP <val> +" and RA rule
    113  is postfix form "CFA -<val> + ^".
    114  """
    115  cfa_offset = 0
    116  ra_offset = 0
    117  cfa_rule = _FindRuleForRegister(cfi_row, _CFA_REG)
    118  ra_rule = _FindRuleForRegister(cfi_row, _RA_REG)
    119  if cfa_rule and re.match(r'sp [0-9]+ \+', cfa_rule):
    120    cfa_offset = int(cfa_rule.split()[1], 10)
    121  if ra_rule:
    122    if not re.match(r'.cfa -[0-9]+ \+ \^', ra_rule):
    123      return (0, 0)
    124    ra_offset = -1 * int(ra_rule.split()[1], 10)
    125  return (cfa_offset, ra_offset)
    126 
    127 
    128 def _GetAllCfiRows(symbol_file):
    129  """Returns parsed CFI data from given symbol_file.
    130 
    131  Each entry in the cfi data dictionary returned is a map from function start
    132  address to array of function rows, starting with FUNCTION type, followed by
    133  one or more CFI rows.
    134  """
    135  cfi_data = {}
    136  current_func = []
    137  for line in symbol_file:
    138    line = line.decode('utf8')
    139    if 'STACK CFI' not in line:
    140      continue
    141 
    142    parts = line.split()
    143    data = {}
    144    if parts[2] == 'INIT':
    145      # Add the previous function to the output
    146      if len(current_func) > 1:
    147        cfi_data[current_func[0][_ADDR_ENTRY]] = current_func
    148      current_func = []
    149 
    150      # The function line is of format "STACK CFI INIT <addr> <length> ..."
    151      data[_ADDR_ENTRY] = int(parts[3], 16)
    152      data[_LENGTH_ENTRY] = int(parts[4], 16)
    153 
    154      # Condition C1: Skip if length is large.
    155      if data[_LENGTH_ENTRY] == 0 or data[_LENGTH_ENTRY] > 0xffff:
    156        continue  # Skip the current function.
    157    else:
    158      # The current function is skipped.
    159      if len(current_func) == 0:
    160        continue
    161 
    162      # The CFI row is of format "STACK CFI <addr> .cfa: <expr> .ra: <expr> ..."
    163      data[_ADDR_ENTRY] = int(parts[2], 16)
    164      (data[_CFA_REG], data[_RA_REG]) = _GetCfaAndRaOffset(parts)
    165 
    166      # Condition C2 and C3: Skip based on limits on offsets.
    167      if data[_CFA_REG] == 0 or data[_RA_REG] >= 16 or data[_CFA_REG] > 0xffff:
    168        current_func = []
    169        continue
    170      assert data[_CFA_REG] % 4 == 0
    171      # Since we skipped functions with code size larger than 0xffff, we should
    172      # have no function offset larger than the same value.
    173      assert data[_ADDR_ENTRY] - current_func[0][_ADDR_ENTRY] < 0xffff
    174 
    175    if data[_ADDR_ENTRY] == 0:
    176      # Skip current function, delete all previous entries.
    177      current_func = []
    178      continue
    179    assert data[_ADDR_ENTRY] % 2 == 0
    180    current_func.append(data)
    181 
    182  # Condition C4: Skip function without CFI rows.
    183  if len(current_func) > 1:
    184    cfi_data[current_func[0][_ADDR_ENTRY]] = current_func
    185  return cfi_data
    186 
    187 
    188 def _WriteCfiData(cfi_data, out_file):
    189  """Writes the CFI data in defined format to out_file."""
    190  # Stores the final data that will be written to UNW_DATA table, in order
    191  # with 2 byte items.
    192  unw_data = []
    193 
    194  # Represent all the CFI data of functions as set of numbers and map them to an
    195  # index in the |unw_data|. This index is later written to the UNW_INDEX table
    196  # for each function. This map is used to find index of the data for functions.
    197  data_to_index = {}
    198  # Store mapping between the functions to the index.
    199  func_addr_to_index = {}
    200  previous_func_end = 0
    201  for addr, function in sorted(cfi_data.items()):
    202    # Add an empty function entry when functions CFIs are missing between 2
    203    # functions.
    204    if previous_func_end != 0 and addr - previous_func_end  > 4:
    205      func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND
    206    previous_func_end = addr + cfi_data[addr][0][_LENGTH_ENTRY]
    207 
    208    assert len(function) > 1
    209    func_data_arr = []
    210    func_data = 0
    211    # The first row contains the function address and length. The rest of the
    212    # rows have CFI data. Create function data array as given in the format.
    213    for row in function[1:]:
    214      addr_offset = row[_ADDR_ENTRY] - addr
    215      cfa_offset = (row[_CFA_REG]) | (row[_RA_REG] // 4)
    216 
    217      func_data_arr.append(addr_offset)
    218      func_data_arr.append(cfa_offset)
    219 
    220    # Consider all the rows in the data as one large integer and add it as a key
    221    # to the |data_to_index|.
    222    for data in func_data_arr:
    223      func_data = (func_data << 16) | data
    224 
    225    row_count = len(func_data_arr) // 2
    226    if func_data not in data_to_index:
    227      # When data is not found, create a new index = len(unw_data), and write
    228      # the data to |unw_data|.
    229      index = len(unw_data)
    230      data_to_index[func_data] = index
    231      unw_data.append(row_count)
    232      for row in func_data_arr:
    233        unw_data.append(row)
    234    else:
    235      # If the data was found, then use the same index for the function.
    236      index = data_to_index[func_data]
    237      assert row_count == unw_data[index]
    238    func_addr_to_index[addr] = data_to_index[func_data]
    239 
    240  # Mark the end end of last function entry.
    241  func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND
    242 
    243  # Write the size of UNW_INDEX file in bytes.
    244  _Write4Bytes(out_file, len(func_addr_to_index))
    245 
    246  # Write the UNW_INDEX table. First list of addresses and then indices.
    247  sorted_unw_index = sorted(func_addr_to_index.items())
    248  for addr, index in sorted_unw_index:
    249    _Write4Bytes(out_file, addr)
    250  for addr, index in sorted_unw_index:
    251    _Write2Bytes(out_file, index)
    252 
    253  # Write the UNW_DATA table.
    254  for data in unw_data:
    255    _Write2Bytes(out_file, data)
    256 
    257 
    258 def main():
    259  parser = argparse.ArgumentParser()
    260  parser.add_argument(
    261      '--input_path', required=True,
    262      help='The input path of the unstripped binary')
    263  parser.add_argument(
    264      '--output_path', required=True,
    265      help='The path of the output file')
    266  parser.add_argument(
    267      '--dump_syms_path', required=True,
    268      help='The path of the dump_syms binary')
    269 
    270  args = parser.parse_args()
    271  cmd = ['./' + args.dump_syms_path, args.input_path, '-v']
    272  proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    273  cfi_data = _GetAllCfiRows(proc.stdout)
    274  if proc.wait():
    275    sys.stderr.write('dump_syms exited with code {} after {} symbols\n'.format(
    276        proc.returncode, len(cfi_data)))
    277    sys.exit(proc.returncode)
    278  with open(args.output_path, 'wb') as out_file:
    279    _WriteCfiData(cfi_data, out_file)
    280 
    281 
    282 if __name__ == '__main__':
    283  main()