tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

dex_parser.py (17364B)


      1 #!/usr/bin/env python3
      2 # Copyright 2019 The Chromium Authors
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 """Utilities for optimistically parsing dex files.
      6 
      7 This file is not meant to provide a generic tool for analyzing dex files.
      8 A DexFile class that exposes access to several memory items in the dex format
      9 is provided, but it does not include error handling or validation.
     10 """
     11 
     12 import argparse
     13 import collections
     14 import errno
     15 import os
     16 import re
     17 import struct
     18 import sys
     19 import zipfile
     20 
     21 # https://source.android.com/devices/tech/dalvik/dex-format#header-item
     22 _DEX_HEADER_FMT = (
     23    ('magic', '8s'),
     24    ('checksum', 'I'),
     25    ('signature', '20s'),
     26    ('file_size', 'I'),
     27    ('header_size', 'I'),
     28    ('endian_tag', 'I'),
     29    ('link_size', 'I'),
     30    ('link_off', 'I'),
     31    ('map_off', 'I'),
     32    ('string_ids_size', 'I'),
     33    ('string_ids_off', 'I'),
     34    ('type_ids_size', 'I'),
     35    ('type_ids_off', 'I'),
     36    ('proto_ids_size', 'I'),
     37    ('proto_ids_off', 'I'),
     38    ('field_ids_size', 'I'),
     39    ('field_ids_off', 'I'),
     40    ('method_ids_size', 'I'),
     41    ('method_ids_off', 'I'),
     42    ('class_defs_size', 'I'),
     43    ('class_defs_off', 'I'),
     44    ('data_size', 'I'),
     45    ('data_off', 'I'),
     46 )
     47 
     48 DexHeader = collections.namedtuple('DexHeader',
     49                                   ','.join(t[0] for t in _DEX_HEADER_FMT))
     50 
     51 # Simple memory items.
     52 _TypeIdItem = collections.namedtuple('TypeIdItem', 'descriptor_idx')
     53 _ProtoIdItem = collections.namedtuple(
     54    'ProtoIdItem', 'shorty_idx,return_type_idx,parameters_off')
     55 _MethodIdItem = collections.namedtuple('MethodIdItem',
     56                                       'type_idx,proto_idx,name_idx')
     57 _TypeItem = collections.namedtuple('TypeItem', 'type_idx')
     58 _StringDataItem = collections.namedtuple('StringItem', 'utf16_size,data')
     59 _ClassDefItem = collections.namedtuple(
     60    'ClassDefItem',
     61    'class_idx,access_flags,superclass_idx,interfaces_off,source_file_idx,'
     62    'annotations_off,class_data_off,static_values_off')
     63 
     64 
     65 class _MemoryItemList:
     66  """Base class for repeated memory items."""
     67 
     68  def __init__(self,
     69               reader,
     70               offset,
     71               size,
     72               factory,
     73               alignment=None,
     74               first_item_offset=None):
     75    """Creates the item list using the specific item factory.
     76 
     77    Args:
     78      reader: _DexReader used for decoding the memory item.
     79      offset: Offset from start of the file to the item list, serving as the
     80        key for some item types.
     81      size: Number of memory items in the list.
     82      factory: Function to extract each memory item from a _DexReader.
     83      alignment: Optional integer specifying the alignment for the memory
     84        section represented by this list.
     85      first_item_offset: Optional, specifies a different offset to use for
     86        extracting memory items (default is to use offset).
     87    """
     88    self.offset = offset
     89    self.size = size
     90    reader.Seek(first_item_offset or offset)
     91    self._items = [factory(reader) for _ in range(size)]
     92 
     93    if alignment:
     94      reader.AlignUpTo(alignment)
     95 
     96  def __iter__(self):
     97    return iter(self._items)
     98 
     99  def __getitem__(self, key):
    100    return self._items[key]
    101 
    102  def __len__(self):
    103    return len(self._items)
    104 
    105  def __repr__(self):
    106    item_type_part = ''
    107    if self.size != 0:
    108      item_type = type(self._items[0])
    109      item_type_part = ', item type={}'.format(item_type.__name__)
    110 
    111    return '{}(offset={:#x}, size={}{})'.format(
    112        type(self).__name__, self.offset, self.size, item_type_part)
    113 
    114 
    115 class _TypeIdItemList(_MemoryItemList):
    116  def __init__(self, reader, offset, size):
    117    factory = lambda x: _TypeIdItem(x.ReadUInt())
    118    super().__init__(reader, offset, size, factory)
    119 
    120 
    121 class _ProtoIdItemList(_MemoryItemList):
    122  def __init__(self, reader, offset, size):
    123    factory = lambda x: _ProtoIdItem(x.ReadUInt(), x.ReadUInt(), x.ReadUInt())
    124    super().__init__(reader, offset, size, factory)
    125 
    126 
    127 class _MethodIdItemList(_MemoryItemList):
    128  def __init__(self, reader, offset, size):
    129    factory = (
    130        lambda x: _MethodIdItem(x.ReadUShort(), x.ReadUShort(), x.ReadUInt()))
    131    super().__init__(reader, offset, size, factory)
    132 
    133 
    134 class _StringItemList(_MemoryItemList):
    135  def __init__(self, reader, offset, size):
    136    reader.Seek(offset)
    137    string_item_offsets = iter([reader.ReadUInt() for _ in range(size)])
    138 
    139    def factory(x):
    140      data_offset = next(string_item_offsets)
    141      string = x.ReadString(data_offset)
    142      return _StringDataItem(len(string), string)
    143 
    144    super().__init__(reader, offset, size, factory)
    145 
    146 
    147 class _TypeListItem(_MemoryItemList):
    148  def __init__(self, reader):
    149    offset = reader.Tell()
    150    size = reader.ReadUInt()
    151    factory = lambda x: _TypeItem(x.ReadUShort())
    152    # This is necessary because we need to extract the size of the type list
    153    # (in other cases the list size is provided in the header).
    154    first_item_offset = reader.Tell()
    155    super().__init__(reader,
    156                     offset,
    157                     size,
    158                     factory,
    159                     alignment=4,
    160                     first_item_offset=first_item_offset)
    161 
    162 
    163 class _TypeListItemList(_MemoryItemList):
    164  def __init__(self, reader, offset, size):
    165    super().__init__(reader, offset, size, _TypeListItem)
    166 
    167 
    168 class _ClassDefItemList(_MemoryItemList):
    169  def __init__(self, reader, offset, size):
    170    reader.Seek(offset)
    171 
    172    def factory(x):
    173      return _ClassDefItem(*(x.ReadUInt()
    174                             for _ in range(len(_ClassDefItem._fields))))
    175 
    176    super().__init__(reader, offset, size, factory)
    177 
    178 
    179 class _DexMapItem:
    180  def __init__(self, reader):
    181    self.type = reader.ReadUShort()
    182    reader.ReadUShort()
    183    self.size = reader.ReadUInt()
    184    self.offset = reader.ReadUInt()
    185 
    186  def __repr__(self):
    187    return '_DexMapItem(type={}, size={}, offset={:#x})'.format(
    188        self.type, self.size, self.offset)
    189 
    190 
    191 class _DexMapList:
    192  # Full list of type codes:
    193  # https://source.android.com/devices/tech/dalvik/dex-format#type-codes
    194  TYPE_TYPE_LIST = 0x1001
    195 
    196  def __init__(self, reader, offset):
    197    self._map = {}
    198    reader.Seek(offset)
    199    self._size = reader.ReadUInt()
    200    for _ in range(self._size):
    201      item = _DexMapItem(reader)
    202      self._map[item.type] = item
    203 
    204  def __getitem__(self, key):
    205    return self._map[key]
    206 
    207  def __contains__(self, key):
    208    return key in self._map
    209 
    210  def __repr__(self):
    211    return '_DexMapList(size={}, items={})'.format(self._size, self._map)
    212 
    213 
    214 class _DexReader:
    215  def __init__(self, data):
    216    self._data = data
    217    self._pos = 0
    218 
    219  def Seek(self, offset):
    220    self._pos = offset
    221 
    222  def Tell(self):
    223    return self._pos
    224 
    225  def ReadUByte(self):
    226    return self._ReadData('<B')
    227 
    228  def ReadUShort(self):
    229    return self._ReadData('<H')
    230 
    231  def ReadUInt(self):
    232    return self._ReadData('<I')
    233 
    234  def ReadString(self, data_offset):
    235    string_length, string_offset = self._ReadULeb128(data_offset)
    236    string_data_offset = string_offset + data_offset
    237    return self._DecodeMUtf8(string_length, string_data_offset)
    238 
    239  def AlignUpTo(self, align_unit):
    240    off_by = self._pos % align_unit
    241    if off_by:
    242      self.Seek(self._pos + align_unit - off_by)
    243 
    244  def ReadHeader(self):
    245    header_fmt = '<' + ''.join(t[1] for t in _DEX_HEADER_FMT)
    246    return DexHeader._make(struct.unpack_from(header_fmt, self._data))
    247 
    248  def _ReadData(self, fmt):
    249    ret = struct.unpack_from(fmt, self._data, self._pos)[0]
    250    self._pos += struct.calcsize(fmt)
    251    return ret
    252 
    253  def _ReadULeb128(self, data_offset):
    254    """Returns a tuple of (uleb128 value, number of bytes occupied).
    255 
    256    From DWARF3 spec: http://dwarfstd.org/doc/Dwarf3.pdf
    257 
    258    Args:
    259      data_offset: Location of the unsigned LEB128.
    260    """
    261    value = 0
    262    shift = 0
    263    cur_offset = data_offset
    264    while True:
    265      byte = self._data[cur_offset]
    266      cur_offset += 1
    267      value |= (byte & 0b01111111) << shift
    268      if (byte & 0b10000000) == 0:
    269        break
    270      shift += 7
    271 
    272    return value, cur_offset - data_offset
    273 
    274  def _DecodeMUtf8(self, string_length, offset):
    275    """Returns the string located at the specified offset.
    276 
    277    See https://source.android.com/devices/tech/dalvik/dex-format#mutf-8
    278 
    279    Ported from the Android Java implementation:
    280    https://android.googlesource.com/platform/dalvik/+/fe107fb6e3f308ac5174ebdc5a794ee880c741d9/dx/src/com/android/dex/Mutf8.java#34
    281 
    282    Args:
    283      string_length: The length of the decoded string.
    284      offset: Offset to the beginning of the string.
    285    """
    286    self.Seek(offset)
    287    ret = ''
    288 
    289    for _ in range(string_length):
    290      a = self.ReadUByte()
    291      if a == 0:
    292        raise _MUTf8DecodeError('Early string termination encountered',
    293                                string_length, offset)
    294      if (a & 0x80) == 0x00:
    295        code = a
    296      elif (a & 0xe0) == 0xc0:
    297        b = self.ReadUByte()
    298        if (b & 0xc0) != 0x80:
    299          raise _MUTf8DecodeError('Error in byte 2', string_length, offset)
    300        code = ((a & 0x1f) << 6) | (b & 0x3f)
    301      elif (a & 0xf0) == 0xe0:
    302        b = self.ReadUByte()
    303        c = self.ReadUByte()
    304        if (b & 0xc0) != 0x80 or (c & 0xc0) != 0x80:
    305          raise _MUTf8DecodeError('Error in byte 3 or 4', string_length, offset)
    306        code = ((a & 0x0f) << 12) | ((b & 0x3f) << 6) | (c & 0x3f)
    307      else:
    308        raise _MUTf8DecodeError('Bad byte', string_length, offset)
    309      ret += chr(code)
    310 
    311    if self.ReadUByte() != 0x00:
    312      raise _MUTf8DecodeError('Expected string termination', string_length,
    313                              offset)
    314 
    315    return ret
    316 
    317 
    318 class _MUTf8DecodeError(Exception):
    319  def __init__(self, message, length, offset):
    320    message += ' (decoded string length: {}, string data offset: {:#x})'.format(
    321        length, offset)
    322    super().__init__(message)
    323 
    324 
    325 class DexFile:
    326  """Represents a single dex file.
    327 
    328  Parses and exposes access to dex file structure and contents, as described
    329  at https://source.android.com/devices/tech/dalvik/dex-format
    330 
    331  Fields:
    332    reader: _DexReader object used to decode dex file contents.
    333    header: DexHeader for this dex file.
    334    map_list: _DexMapList object containing list of dex file contents.
    335    type_item_list: _TypeIdItemList containing type_id_items.
    336    proto_item_list: _ProtoIdItemList containing proto_id_items.
    337    method_item_list: _MethodIdItemList containing method_id_items.
    338    string_item_list: _StringItemList containing string_data_items that are
    339      referenced by index in other sections.
    340    type_list_item_list: _TypeListItemList containing _TypeListItems.
    341      _TypeListItems are referenced by their offsets from other dex items.
    342    class_def_item_list: _ClassDefItemList containing _ClassDefItems.
    343  """
    344  _CLASS_ACCESS_FLAGS = {
    345      0x1: 'public',
    346      0x2: 'private',
    347      0x4: 'protected',
    348      0x8: 'static',
    349      0x10: 'final',
    350      0x200: 'interface',
    351      0x400: 'abstract',
    352      0x1000: 'synthetic',
    353      0x2000: 'annotation',
    354      0x4000: 'enum',
    355  }
    356 
    357  def __init__(self, data):
    358    """Decodes dex file memory sections.
    359 
    360    Args:
    361      data: bytearray containing the contents of a dex file.
    362    """
    363    self.reader = _DexReader(data)
    364    self.header = self.reader.ReadHeader()
    365    self.map_list = _DexMapList(self.reader, self.header.map_off)
    366    self.type_item_list = _TypeIdItemList(self.reader, self.header.type_ids_off,
    367                                          self.header.type_ids_size)
    368    self.proto_item_list = _ProtoIdItemList(self.reader,
    369                                            self.header.proto_ids_off,
    370                                            self.header.proto_ids_size)
    371    self.method_item_list = _MethodIdItemList(self.reader,
    372                                              self.header.method_ids_off,
    373                                              self.header.method_ids_size)
    374    self.string_item_list = _StringItemList(self.reader,
    375                                            self.header.string_ids_off,
    376                                            self.header.string_ids_size)
    377    self.class_def_item_list = _ClassDefItemList(self.reader,
    378                                                 self.header.class_defs_off,
    379                                                 self.header.class_defs_size)
    380 
    381    type_list_key = _DexMapList.TYPE_TYPE_LIST
    382    if type_list_key in self.map_list:
    383      map_list_item = self.map_list[type_list_key]
    384      self.type_list_item_list = _TypeListItemList(self.reader,
    385                                                   map_list_item.offset,
    386                                                   map_list_item.size)
    387    else:
    388      self.type_list_item_list = _TypeListItemList(self.reader, 0, 0)
    389    self._type_lists_by_offset = {
    390        type_list.offset: type_list
    391        for type_list in self.type_list_item_list
    392    }
    393 
    394  def GetString(self, string_item_idx):
    395    string_item = self.string_item_list[string_item_idx]
    396    return string_item.data
    397 
    398  def GetTypeString(self, type_item_idx):
    399    type_item = self.type_item_list[type_item_idx]
    400    return self.GetString(type_item.descriptor_idx)
    401 
    402  def GetTypeListStringsByOffset(self, offset):
    403    if not offset:
    404      return ()
    405    type_list = self._type_lists_by_offset[offset]
    406    return tuple(self.GetTypeString(item.type_idx) for item in type_list)
    407 
    408  @staticmethod
    409  def ResolveClassAccessFlags(access_flags):
    410    return tuple(flag_string
    411                 for flag, flag_string in DexFile._CLASS_ACCESS_FLAGS.items()
    412                 if flag & access_flags)
    413 
    414  def IterMethodSignatureParts(self):
    415    """Yields the string components of dex methods in a dex file.
    416 
    417    Yields:
    418      Tuples that look like:
    419        (class name, return type, method name, (parameter type, ...)).
    420    """
    421    for method_item in self.method_item_list:
    422      class_name_string = self.GetTypeString(method_item.type_idx)
    423      method_name_string = self.GetString(method_item.name_idx)
    424      proto_item = self.proto_item_list[method_item.proto_idx]
    425      return_type_string = self.GetTypeString(proto_item.return_type_idx)
    426      parameter_types = self.GetTypeListStringsByOffset(
    427          proto_item.parameters_off)
    428      yield (class_name_string, return_type_string, method_name_string,
    429             parameter_types)
    430 
    431  def __repr__(self):
    432    items = [
    433        self.header,
    434        self.map_list,
    435        self.type_item_list,
    436        self.proto_item_list,
    437        self.method_item_list,
    438        self.string_item_list,
    439        self.type_list_item_list,
    440        self.class_def_item_list,
    441    ]
    442    return '\n'.join(str(item) for item in items)
    443 
    444 
    445 class _DumpCommand:
    446  def __init__(self, dexfile):
    447    self._dexfile = dexfile
    448 
    449  def Run(self):
    450    raise NotImplementedError()
    451 
    452 
    453 class _DumpMethods(_DumpCommand):
    454  def Run(self):
    455    for parts in self._dexfile.IterMethodSignatureParts():
    456      class_type, return_type, method_name, parameter_types = parts
    457      print('{} {} (return type={}, parameters={})'.format(
    458          class_type, method_name, return_type, parameter_types))
    459 
    460 
    461 class _DumpStrings(_DumpCommand):
    462  def Run(self):
    463    for string_item in self._dexfile.string_item_list:
    464      # Some strings are likely to be non-ascii (vs. methods/classes).
    465      print(string_item.data.encode('utf-8'))
    466 
    467 
    468 class _DumpClasses(_DumpCommand):
    469  def Run(self):
    470    for class_item in self._dexfile.class_def_item_list:
    471      class_string = self._dexfile.GetTypeString(class_item.class_idx)
    472      superclass_string = self._dexfile.GetTypeString(class_item.superclass_idx)
    473      interfaces = self._dexfile.GetTypeListStringsByOffset(
    474          class_item.interfaces_off)
    475      access_flags = DexFile.ResolveClassAccessFlags(class_item.access_flags)
    476      print('{} (superclass={}, interfaces={}, access_flags={})'.format(
    477          class_string, superclass_string, interfaces, access_flags))
    478 
    479 
    480 class _DumpSummary(_DumpCommand):
    481  def Run(self):
    482    print(self._dexfile)
    483 
    484 
    485 def _DumpDexItems(dexfile_data, name, item):
    486  dexfile = DexFile(bytearray(dexfile_data))
    487  print('dex_parser: Dumping {} for {}'.format(item, name))
    488  cmds = {
    489      'summary': _DumpSummary,
    490      'methods': _DumpMethods,
    491      'strings': _DumpStrings,
    492      'classes': _DumpClasses,
    493  }
    494  try:
    495    cmds[item](dexfile).Run()
    496  except IOError as e:
    497    if e.errno == errno.EPIPE:
    498      # Assume we're piping to "less", do nothing.
    499      pass
    500 
    501 
    502 def main():
    503  parser = argparse.ArgumentParser(description='Dump dex contents to stdout.')
    504  parser.add_argument('input',
    505                      help='Input (.dex, .jar, .zip, .aab, .apk) file path.')
    506  parser.add_argument('item',
    507                      choices=('methods', 'strings', 'classes', 'summary'),
    508                      help='Item to dump',
    509                      nargs='?',
    510                      default='summary')
    511  args = parser.parse_args()
    512 
    513  if os.path.splitext(args.input)[1] in ('.apk', '.jar', '.zip', '.aab'):
    514    with zipfile.ZipFile(args.input) as z:
    515      dex_file_paths = [
    516          f for f in z.namelist() if re.match(r'.*classes[0-9]*\.dex$', f)
    517      ]
    518      if not dex_file_paths:
    519        print('Error: {} does not contain any classes.dex files'.format(
    520            args.input))
    521        sys.exit(1)
    522 
    523      for path in dex_file_paths:
    524        _DumpDexItems(z.read(path), path, args.item)
    525 
    526  else:
    527    with open(args.input, 'rb') as f:
    528      _DumpDexItems(f.read(), args.input, args.item)
    529 
    530 
    531 if __name__ == '__main__':
    532  main()